diff --git a/.cirrus.yml b/.cirrus.yml
new file mode 100644
index 000000000..02cd40997
--- /dev/null
+++ b/.cirrus.yml
@@ -0,0 +1,167 @@
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+
+task:
+  name: AppleM1/LLVM
+  compile_script:
+    - brew install llvm
+    - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+    - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+    - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+    - make TARGET=VORTEX USE_OPENMP=1 CC=clang
+
+task:
+  name: AppleM1/LLVM/ILP64
+  compile_script:
+    - brew install llvm
+    - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+    - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+    - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+    - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
+
+task:
+  name: AppleM1/LLVM/CMAKE
+  compile_script:
+    - brew install llvm
+    - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+    - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+    - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+    - mkdir build
+    - cd build
+    - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
+    - make
+
+task:
+  name: AppleM1/GCC/MAKE/OPENMP
+  compile_script:
+    - brew install gcc@11
+    - export PATH=/opt/homebrew/bin:$PATH
+    - export LDFLAGS="-L/opt/homebrew/lib"
+    - export CPPFLAGS="-I/opt/homebrew/include"
+    - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
+
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+task:
+  name: AppleM1/LLVM x86_64 xbuild
+  compile_script:
+    #- brew install llvm
+    #- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+    #- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+    #- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+    - export ARCHS="i386 x86_64"
+    - export ARCHS_STANDARD="i386 x86_64"
+    - export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
+    - export ARCHS_STANDARD_64_BIT=x86_64
+    - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
+    - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
+    - export VALID_ARCHS="i386 x86_64"
+    - xcrun --sdk macosx --show-sdk-path
+    - xcodebuild -version
+    - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+    - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
+    - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
+  always:
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+#    lib_artifacts:
+#      path: "libopenblas*"
+#      type: application/octet-stream
+
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+task:
+  name: AppleM1/LLVM armv8-ios xbuild
+  compile_script:
+    #- brew install llvm
+    #- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+    #- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+    #- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+    - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+    - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
+    - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
+  always:
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+task:
+  name: AppleM1/LLVM armv7-androidndk xbuild
+  compile_script:
+    #- brew install android-ndk
+    #- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+    #- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+    #- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+    - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
+    #- export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+    #- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
+    - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
+    - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
+  always:
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+
+task:
+  name: NeoverseN1
+  arm_container:
+    image: node:latest
+  compile_script:
+    - make
+
+task:
+  name: NeoverseN1-ILP64
+  arm_container:
+    image: node:latest
+  compile_script:
+    - make INTERFACE64=1
+
+task:
+  name: NeoverseN1-OMP
+  arm_container:
+    image: node:latest
+    cpu: 8
+  compile_script:
+    - make USE_OPENMP=1
+
+FreeBSD_task:
+  name: FreeBSD-gcc12
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+    - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
+  compile_script:
+    - ls -l /usr/local/lib
+    - gmake CC=gcc
+
+FreeBSD_task:
+  name: freebsd-gcc12-ilp64
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+    - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
+  compile_script:
+    - ls -l /usr/local/lib
+    - gmake CC=gcc INTERFACE64=1
+
+#task:
+#  name: Windows/LLVM16 --- too slow ---
+#  windows_container:
+#    image: cirrusci/windowsservercore:cmake-2021.12.07
+#  install_script:
+#    - choco list --localonly
+#    - choco install -y llvm
+#    - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
+#    - choco install -y ninja
+#    - refreshenv
+#    - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
+#    - vcvarsall x64
+#    - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
+#    - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
+#    - cd build
+#    - cmake --build .
+#    - ctest
diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
new file mode 100644
index 000000000..199304fb1
--- /dev/null
+++ b/.github/workflows/c910v.yml
@@ -0,0 +1,121 @@
+name: c910v qemu test
+
+on: [push, pull_request]
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  TEST:
+    runs-on: ubuntu-latest
+    env:
+      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
+      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: RISCV64_GENERIC
+            triple: riscv64-linux-gnu
+            apt_triple: riscv64-linux-gnu
+            opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
+          - target: C910V
+            triple: riscv64-unknown-linux-gnu
+            apt_triple: riscv64-linux-gnu
+            opts: NO_SHARED=1 TARGET=C910V
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: install build deps
+        run: |
+          sudo apt-get update
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
+            gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
+
+      - name: checkout qemu
+        uses: actions/checkout@v3
+        with:
+          repository: T-head-Semi/qemu
+          path: qemu
+          ref: 1e692ebb43d396c52352406323fc782c1ac99a42
+
+      - name: build qemu
+        run: |
+          # Force use of the c910v qemu-user
+          wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          cd qemu
+          patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
+          make -j$(nproc)
+          make install
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: build OpenBLAS
+        run: |
+          wget ${xuetie_toolchain}/${toolchain_file_name}
+          tar -xvf ${toolchain_file_name} -C /opt
+          export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
+
+          make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
+
+      - name: test
+        run: |
+          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
+          qemu-riscv64 ./utest/openblas_utest
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index ff40b354d..4fe6e63fc 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -151,40 +151,53 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        msystem: [MINGW64, MINGW32, CLANG64]
+        msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
         idx: [int32, int64]
         build-type: [Release]
         include:
           - msystem: MINGW64
             idx: int32
             target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            fc-pkg: fc
           - msystem: MINGW32
             idx: int32
             target-prefix: mingw-w64-i686
-            fc-pkg: mingw-w64-i686-gcc-fortran
+            fc-pkg: fc
          - msystem: CLANG64
             idx: int32
             target-prefix: mingw-w64-clang-x86_64
+            fc-pkg: fc
+            # Compiling with Flang 16 seems to cause test errors on machines
+            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
+            no-avx512-flags: -DNO_AVX512=1
+          - msystem: CLANG32
+            idx: int32
+            target-prefix: mingw-w64-clang-i686
+            fc-pkg: cc
             c-lapack-flags: -DC_LAPACK=ON
           - msystem: MINGW64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
             target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            fc-pkg: fc
           - msystem: CLANG64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
             target-prefix: mingw-w64-clang-x86_64
-            c-lapack-flags: -DC_LAPACK=ON
+            fc-pkg: fc
+            # Compiling with Flang 16 seems to cause test errors on machines
+            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
+            no-avx512-flags: -DNO_AVX512=1
           - msystem: MINGW64
             idx: int32
             target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            fc-pkg: fc
             build-type: None
         exclude:
           - msystem: MINGW32
             idx: int64
+          - msystem: CLANG32
+            idx: int64
 
     defaults:
       run:
@@ -209,7 +222,7 @@ jobs:
         install: >-
           base-devel
           ${{ matrix.target-prefix }}-cc
-          ${{ matrix.fc-pkg }}
+          ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
           ${{ matrix.target-prefix }}-cmake
           ${{ matrix.target-prefix }}-ninja
           ${{ matrix.target-prefix }}-ccache
 
       - name: Checkout repository
         uses: actions/checkout@v3
 
-      - name: Compilation cache
-        uses: actions/cache@v3
-        with:
-          # It looks like this path needs to be hard-coded.
-          path: C:/msys64/home/runneradmin/.ccache
+      - name: Prepare ccache
+        # Get cache location of ccache
+        # Create key that is used in action/cache/restore and action/cache/save steps
+        id: ccache-prepare
+        run: |
+          echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
           # We include the commit sha in the cache key, as new cache entries are
           # only created if there is no existing entry for the key yet.
-          key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
+          echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
+
+      - name: Restore ccache
+        uses: actions/cache/restore@v3
+        with:
+          path: ${{ steps.ccache-prepare.outputs.ccachedir }}
+          key: ${{ steps.ccache-prepare.outputs.key }}
           # Restore a matching ccache cache entry. Prefer same branch.
           restore-keys: |
             ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
@@ -234,9 +254,10 @@ jobs:
         # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
         run: |
           which ccache
-          test -d ~/.ccache || mkdir -p ~/.ccache
-          echo "max_size = 250M" > ~/.ccache/ccache.conf
-          echo "compression = true" >> ~/.ccache/ccache.conf
+          test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
+          echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
+          echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
+          ccache -p
           ccache -s
           echo $HOME
           cygpath -w $HOME
@@ -253,6 +274,7 @@ jobs:
             -DTARGET=CORE2 \
             ${{ matrix.idx64-flags }} \
             ${{ matrix.c-lapack-flags }} \
+            ${{ matrix.no-avx512-flags }} \
             -DCMAKE_C_COMPILER_LAUNCHER=ccache \
             -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
             ..
@@ -264,10 +286,30 @@ jobs:
         continue-on-error: true
         run: ccache -s
 
+      - name: Save ccache
+        # Save the cache after we are done (successfully) building
+        uses: actions/cache/save@v3
+        with:
+          path: ${{ steps.ccache-prepare.outputs.ccachedir }}
+          key: ${{ steps.ccache-prepare.outputs.key }}
+
       - name: Run tests
+        id: run-ctest
         timeout-minutes: 60
         run: cd build && ctest
 
+      - name: Re-run tests
+        if: always() && (steps.run-ctest.outcome == 'failure')
+        timeout-minutes: 60
+        run: |
+          cd build
+          echo "::group::Re-run ctest"
+          ctest --rerun-failed --output-on-failure || true
+          echo "::endgroup::"
+          echo "::group::Log from these tests"
+          [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
+          echo "::endgroup::"
+
   cross_build:
     runs-on: ubuntu-22.04
@@ -295,6 +337,7 @@ jobs:
 
       - name: Install Dependencies
         run: |
+          sudo apt-get update
           sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
 
       - name: Compilation cache
diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
new file mode 100644
index 000000000..5501e98e0
--- /dev/null
+++ b/.github/workflows/loongarch64.yml
@@ -0,0 +1,110 @@
+name: loongarch64 qemu test
+
+on: [push, pull_request]
+
+jobs:
+  TEST:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: LOONGSONGENERIC
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
+          - target: LOONGSON3R5
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 TARGET=LOONGSON3R5
+          - target: LOONGSON2K1000
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Install APT deps
+        run: |
+          sudo add-apt-repository ppa:savoury1/virtualisation
+          sudo apt-get update
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
+            qemu-user-static
+
+      - name: Download and install loongarch64-toolchain
+        run: |
+          wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
+          tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
+
+      - name: Set env
+        run: |
+          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Disable utest dsdot:dsdot_n_1
+        run: |
+          echo -n > utest/test_dsdot.c
+          echo "Due to qemu version 7.2 causing utest cases to fail,"
+          echo "the utest dsdot:dsdot_n_1 has been temporarily disabled."
+
+      - name: Build OpenBLAS
+        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
+
+      - name: Test
+        run: |
+          qemu-loongarch64-static ./utest/openblas_utest
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
diff --git a/.gitignore b/.gitignore
index 0fe20ecaa..8b27325db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ lapack-3.4.2
 lapack-3.4.2.tgz
 lapack-netlib/make.inc
 lapack-netlib/lapacke/include/lapacke_mangling.h
+lapack-netlib/SRC/la_constants.mod
 lapack-netlib/TESTING/testing_results.txt
 lapack-netlib/INSTALL/test*
 lapack-netlib/TESTING/xeigtstc
@@ -71,6 +72,7 @@ test/SBLAT3.SUMM
 test/ZBLAT2.SUMM
 test/ZBLAT3.SUMM
 test/SHBLAT3.SUMM
+test/SBBLAT3.SUMM
 test/cblat1
 test/cblat2
 test/cblat3
@@ -81,6 +83,7 @@ test/sblat1
 test/sblat2
 test/sblat3
 test/test_shgemm
+test/test_sbgemm
 test/zblat1
 test/zblat2
 test/zblat3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 502bf7a9d..86758d8b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 23)
+set(OpenBLAS_PATCH_VERSION 23.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers)
 #######
 option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
 
+option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
+
 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
 
 option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
@@ -309,19 +311,25 @@ endif()
 
 #if (MSVC OR NOT NOFORTRAN)
 if (NOT NO_CBLAS)
+  if (NOT ONLY_CBLAS)
 # Broken without fortran on unix
-  add_subdirectory(utest)
+    add_subdirectory(utest)
+  endif()
 endif()
 
 if (NOT NOFORTRAN)
+  if (NOT ONLY_CBLAS)
   # Build test and ctest
   add_subdirectory(test)
+  endif()
   if (BUILD_TESTING)
     add_subdirectory(lapack-netlib/TESTING)
   endif()
 endif()
 
 if(NOT NO_CBLAS)
+  if (NOT ONLY_CBLAS)
   add_subdirectory(ctest)
+  endif()
 endif()
 
 if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
   add_subdirectory(cpp_thread_test)
@@ -398,15 +406,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
   message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
 endif()
+  if (${BUILD_LAPACK_DEPRECATED})
+    set (BLD 1)
+  else ()
+    set (BLD 0)
+  endif()
+  if (${BUILD_BFLOAT16})
+    set (BBF16 1)
+  else ()
+    set (BBF16 0)
+  endif()
+  if (${BUILD_SINGLE})
+    set (BS 1)
+  else ()
+    set (BS 0)
+  endif()
+  if (${BUILD_DOUBLE})
+    set (BD 1)
+  else ()
+    set (BD 0)
+  endif()
+  if (${BUILD_COMPLEX})
+    set (BC 1)
+  else ()
+    set (BC 0)
+  endif()
+  if (${BUILD_COMPLEX16})
+    set (BZ 1)
+  else ()
+    set (BZ 0)
+  endif()
 if (NOT USE_PERL)
 add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
-  COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
+  COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
   COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
   COMMENT "renaming symbols"
   )
 else()
 add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
-  COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
+  COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
   COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
   COMMENT "renaming symbols"
   )
@@ -511,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
 install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
 
-# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
 set(PN OpenBLAS)
-set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
+set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
 configure_package_config_file(cmake/${PN}Config.cmake.in
   "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
   INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f5e9dda91..71df13634 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -23,6 +23,9 @@
 * Optimization on AMD Piledriver
 * Optimization on Intel Haswell
 
+* Chris Sidebottom
+  * Optimizations and other improvements targeting AArch64
+
 ## Previous Developers
 
 * Zaheer Chothia
@@ -212,4 +215,4 @@ In chronological order:
 * [2022-03] Support RISC-V Vector Intrinsic 1.0 version.
 
 * Pablo Romero
-  * [2022-08] Fix building from sources for QNX
\ No newline at end of file
+  * [2022-08] Fix building from sources for QNX
diff --git a/Changelog.txt b/Changelog.txt
index aa445ae82..3937ef08c 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,104 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.24
+  03-Sep-2023
+
+general:
+ - declared the arguments of cblas_xerbla as const (in accordance with the reference implementation
+   and others; the previous discrepancy appears to have dated back to GotoBLAS)
+ - fixed the implementation of ?GEMMT that was added in 0.3.23
+ - made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds
+ - fixed application of SYMBOLSUFFIX in CMAKE builds
+ - fixed missing SSYCONVF function in the shared library
+ - fixed parallel build logic used with gmake
+ - added support for compilation with LLVM17, in particular its new Fortran compiler
+ - added support for CMAKE builds using the NVIDIA HPC compiler
+ - fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler
+ - fixed cross-build detection and management in c_check
+ - disabled building of the tests with CMAKE when ONLY_CBLAS is defined
+ - fixed several issues with the handling of runtime limits on the number of OPENMP threads
+ - corrected the error code returned by SGEADD/DGEADD when LDA is too small
+ - corrected the error code returned by IMATCOPY when LDB is too small
+ - updated ?NRM2 to support negative increment values (as introduced in release 3.10
+   of the reference BLAS)
+ - fixed OpenMP builds with CLANG for the case where libomp is not in a standard location
+ - fixed a potential overwrite of unrelated memory during thread initialisation on startup
+ - fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK
+ - fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22
+ - fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE
+ - applied additions and corrections from the development branch of Reference-LAPACK:
+   - fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885)
+   - fixed workspace query results in LAPACK ?SYTRF/?TREVC3 (from Reference-LAPACK PR 883)
+   - fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878)
+   - fixed a crash in LAPACK ?GELSD on NRHS=0 (from Reference-LAPACK PR 876)
+   - added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839)
+   - corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867)
+   - removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860)
+   - updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852)
+   - fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855)
+   - fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849)
+   - added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736)
+   - fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854)
+   - applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847)
+   - removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832)
+   - fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836)
+   - added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837)
+   - updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831)
+   - improved algorithm description in ?GELSY (Reference-LAPACK PR 833)
+   - fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830)
+   - fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768)
+   - added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827)
+   - added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795)
+   - fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820)
+   - adopted the refactored ?GEBAL implementation (Reference-LAPACK PR 808)
+
+x86_64:
+ - added cpu model autodetection for Intel Alder Lake N
+ - added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel
+ - worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer
+ - fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG
+ - fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH
+ - fixed feature-based cputype fallback in DYNAMIC_ARCH
+ - added support for building the AVX512 kernels with the NVIDIA HPC compiler
+ - corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case
+ - fixed a potential use of uninitialized variables in ZTRSM
+
+ARM64:
+ - added cpu model autodetection for Apple M2
+ - fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register)
+ - added support for building the SVE kernels with the NVIDIA HPC compiler
+ - added support for building the SVE kernels with the Apple Clang compiler
+ - fixed compiler option handling for building the SVE kernels with LLVM
+ - implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse
+ - activated SVE SGEMM and DGEMM kernels for Neoverse V1
+ - improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1
+ - improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH
+ - fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or
+   container restrictions into account
+ - fixed a potential use of uninitialized variables in ZTRSM
+ - fixed a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds
+
+LOONGARCH64:
+ - added ABI detection
+ - added support for cpu affinity handling
+ - fixed compilation with early versions of the Loongson toolchain
+ - added an optimized SGEMM kernel for 3A5000
+ - added optimized DGEMV kernels for 3A5000
+ - improved the performance of the DGEMM kernel for 3A5000
+
+MIPS64:
+ - fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target
+
+POWER:
+ - fixed compiler warnings in the POWER10 SBGEMM kernel
+
+RISCV:
+ - fixed application of the INTERFACE64 option when building with CMAKE
+ - fixed a potential misdetection of RISCV hardware as 32bit in CMAKE builds
+ - fixed IDAMAX and DOT kernels for C910V
+ - fixed corner cases in the ROT and SWAP kernels for C910V
+ - fixed compilation of the C910V target with recent vendor compilers
+
 ====================================================================
 Version 0.3.23
 	01-Apr-2023
diff --git a/Jenkinsfile b/Jenkinsfile
index 2b61bed9f..baeeee59f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,9 +1,14 @@
-node {
-    stage('Checkout') {
-        checkout
+pipeline {
+    agent {
+        docker {
+            image 'osuosl/ubuntu-s390x'
         }
-
+    }
+    stages {
     stage('Build') {
-        sh("make")
+        steps {
+            sh 'make clean && make'
+        }
     }
+    }
 }
diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr
new file mode 100644
index 000000000..96e18b8ad
--- /dev/null
+++ b/Jenkinsfile.pwr
@@ -0,0 +1,16 @@
+pipeline {
+    agent {
+        docker {
+            image 'osuosl/ubuntu-ppc64le'
+        }
+    }
+    stages {
+        stage('Build') {
+            steps {
+                sh 'sudo apt update'
+                sh 'sudo apt install gfortran -y'
+                sh 'make clean && make'
+            }
+        }
+    }
+}
diff --git a/Makefile b/Makefile
index 144b3400d..299970c67 100644
--- a/Makefile
+++ b/Makefile
@@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
 
 .PHONY : all libs netlib $(RELA) test ctest shared install
-.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
+.NOTPARALLEL : shared
 
-all :: libs netlib $(RELA) tests shared
+all :: tests
 	@echo
 	@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
 	@echo
@@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT)
 endif
 endif
 
-tests : libs netlib $(RELA) shared
+tests : shared
 ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	touch $(LIBNAME)
 ifndef NO_FBLAS
@@ -373,10 +373,10 @@ ifneq ($(CROSS), 1)
 	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
 endif
 
-lapack-runtest:
+lapack-runtest: lapack-test
 	( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
 	./testsecond; ./testdsecnd; ./testieee; ./testversion )
-	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
+	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
 
 blas-test:
diff --git a/Makefile.arm64 b/Makefile.arm64
index 064e84cbb..1b10446f7 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -69,7 +69,7 @@ endif
 # in GCC>=9
 ifeq ($(CORE), NEOVERSEN1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
@@ -92,9 +92,14 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEV1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
-CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.4-a+sve
+ifeq (1, $(ISCLANG))
+CCOMMON_OPT += -mtune=cortex-x1
+else
+CCOMMON_OPT += -mtune=neoverse-v1
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
 endif
@@ -122,8 +127,8 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEN2)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifneq ($(OSNAME), Darwin)
 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 else
@@ -155,7 +160,7 @@ endif
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ8), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
@@ -196,8 +201,13 @@ endif
 endif
 
 ifeq ($(CORE), THUNDERX3T110)
-ifeq ($(GCCVERSIONGTEQ10), 1)
-CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.3-a
+ifeq (0, $(ISCLANG))
+CCOMMON_OPT += -mtune=thunderx3t110
+else
+CCOMMON_OPT += -mtune=thunderx2t99
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
 endif
@@ -225,9 +235,12 @@ endif
 endif
 endif
 
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 ifeq ($(CORE), EMAG8180)
-CCOMMON_OPT += -march=armv8-a -mtune=emag
+CCOMMON_OPT += -march=armv8-a
+ifeq ($(ISCLANG), 0)
+CCOMMON_OPT += -mtune=emag
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=emag
 endif
diff --git a/Makefile.rule b/Makefile.rule
index ab46fd075..e210e49e8 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.23
+VERSION = 0.3.23.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.system b/Makefile.system
index 343b94bb3..b3968d739 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
 endif
 
+ifeq ($(C_COMPILER), CLANG)
+CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
+CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
+endif
+
 #
 # OS dependent settings
 #
@@ -645,7 +650,7 @@ DYNAMIC_CORE += HASWELL ZEN
 endif
 ifneq ($(NO_AVX512), 1)
 ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += SKYLAKEX COOPERLAKE
+DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
 endif
 endif
 endif
@@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
+DYNAMIC_CORE += ARMV8SVE
 endif
 DYNAMIC_CORE += CORTEXA55
 DYNAMIC_CORE += FALKOR
@@ -932,8 +938,12 @@ BINARY_DEFINED = 1
 endif
 
 ifeq ($(ARCH), loongarch64)
-CCOMMON_OPT += -march=loongarch64 -mabi=lp64
-FCOMMON_OPT += -march=loongarch64 -mabi=lp64
+LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
+ifneq ($(LA64_ABI), lp64d)
+LA64_ABI=lp64
+endif
+CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
+FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 endif
 endif
@@ -1082,8 +1092,9 @@ endif
 endif
 endif
 
-ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
 CCOMMON_OPT += -DF_INTERFACE_GFORT
+ifeq ($(F_COMPILER), GFORTRAN)
 FCOMMON_OPT += -Wall
 # make single-threaded LAPACK calls thread-safe #1847
 FCOMMON_OPT += -frecursive
@@ -1097,6 +1108,7 @@ EXTRALIB += -lgfortran
 endif
 endif
 endif
+endif
 
 ifdef NO_BINARY_MODE
 ifeq ($(ARCH), $(filter $(ARCH),mips64))
 ifdef BINARY64
@@ -1763,6 +1775,8 @@ export TARGET_CORE
 export NO_AVX512
 export NO_AVX2
 export BUILD_BFLOAT16
+export NO_LSX
+export NO_LASX
 
 export SBGEMM_UNROLL_M
 export SBGEMM_UNROLL_N
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 7ab331b1f..702447ace 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -75,18 +75,31 @@ endif
 
 ifeq ($(CORE), COOPERLAKE)
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
-# cooperlake support was added in 10.1
-ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
-CCOMMON_OPT += -march=cooperlake
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=cooperlake
-endif
-else # gcc not support, fallback to avx512
-CCOMMON_OPT += -march=skylake-avx512
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=skylake-avx512
-endif
-endif
+  # Cooperlake support was added in GCC 10.1
+  ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
+    CCOMMON_OPT += -march=cooperlake
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=cooperlake
+    endif
+  else # gcc does not support it, fall back to avx512
+    CCOMMON_OPT += -march=skylake-avx512
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=skylake-avx512
+    endif
+  endif
+else ifeq ($(C_COMPILER), CLANG)
+  # Cooperlake support was added in clang 9
+  ifeq ($(CLANGVERSIONGTEQ9), 1)
+    CCOMMON_OPT += -march=cooperlake
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=cooperlake
+    endif
+  else # not supported in clang, fall back to avx512
+    CCOMMON_OPT += -march=skylake-avx512
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=skylake-avx512
+    endif
+  endif
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 CCOMMON_OPT += -fno-asynchronous-unwind-tables
@@ -104,18 +117,31 @@ endif
 
 ifeq ($(CORE), SAPPHIRERAPIDS)
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
-# sapphire rapids support was added in 11
-ifeq ($(GCCVERSIONGTEQ11), 1)
-CCOMMON_OPT += -march=sapphirerapids
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=sapphirerapids
-endif
-else # gcc not support, fallback to avx512
-CCOMMON_OPT += -march=skylake-avx512
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=skylake-avx512
-endif
-endif
+  # Sapphire Rapids support was added in GCC 11
+  ifeq ($(GCCVERSIONGTEQ11), 1)
+    CCOMMON_OPT += -march=sapphirerapids
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=sapphirerapids
+    endif
+  else # gcc does not support it, fall back to avx512
+    CCOMMON_OPT += -march=skylake-avx512
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=skylake-avx512
+    endif
+  endif
+else ifeq ($(C_COMPILER), CLANG)
+  # with clang 12 and newer, use cooperlake as the closest available target
+  ifeq ($(CLANGVERSIONGTEQ12), 1)
+    CCOMMON_OPT += -march=cooperlake
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=cooperlake
+    endif
+  else # not supported in clang, fall back to avx512
+    CCOMMON_OPT += -march=skylake-avx512
+    ifneq ($(F_COMPILER), NAG)
+      FCOMMON_OPT += -march=skylake-avx512
+    endif
+  endif
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 CCOMMON_OPT += -fno-asynchronous-unwind-tables
diff --git a/README.md b/README.md
index 6ce85e08e..081d45870 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,15 @@ Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS)
 
 AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
 
-Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
+Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
+
 
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
 
+OSUOSL POWERCI: [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
+
+OSUOSL IBMZ-CI: [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
+
 ## Introduction
 
 OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
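Note on the two Makefile.x86_64 hunks above: both follow the same pattern of picking -march=cooperlake (or -march=sapphirerapids) when the compiler is new enough and quietly falling back to -march=skylake-avx512 otherwise, with the clang branch gated on the CLANGVERSIONGTEQ9/CLANGVERSIONGTEQ12 variables that the Makefile.system hunk introduces. The following minimal sketch, which is not part of the patch, reproduces that version test so it can be tried by hand; "clang" stands in for whatever $(CC) resolves to, and the echoed results are illustrative only:

    #!/bin/sh
    # Reproduce the CLANGVERSIONGTEQ12 test from Makefile.system:
    # take the major component of `cc -dumpversion` and compare it to 12.
    CC=clang
    MAJOR=`$CC -dumpversion | cut -f1 -d.`
    if [ `expr "$MAJOR" \>= 12` -eq 1 ]; then
        echo "would build with -march=cooperlake"
    else
        echo "would fall back to -march=skylake-avx512"
    fi

The gcc side works the same way, except that the 10.1 requirement is split across a major and a minor comparison (GCCVERSIONGTEQ10 plus GCCMINORVERSIONGTEQ1).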
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 16b9da4f5..ff56ad00b 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -115,7 +115,7 @@ jobs:
       mkdir build
       cd build
       call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-      cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
+      cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
       cmake --build . --config Release
       ctest
@@ -271,6 +271,19 @@
   - script: |
       make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
 
+- job: OSX_xbuild_DYNAMIC_ARM64
+  pool:
+    vmImage: 'macOS-11'
+  variables:
+    CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+    CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
+  steps:
+  - script: |
+      ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
+      /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
+      /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
+      make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
+
 - job: ALPINE_MUSL
   pool:
     vmImage: 'ubuntu-latest'
diff --git a/benchmark/spr.c b/benchmark/spr.c
old mode 100755
new mode 100644
diff --git a/benchmark/spr2.c b/benchmark/spr2.c
old mode 100755
new mode 100644
diff --git a/benchmark/syrk.c b/benchmark/syrk.c
index fa0f24666..e0ae58707 100644
--- a/benchmark/syrk.c
+++ b/benchmark/syrk.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
+Copyright (c) 2014, 2023 The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
   int step = 1;
   int loops = 1;
 
-  if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
 
   double time1,timeg;
 
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
   if (argc > 0) { step = atol(*argv); argc--; argv++;}
 
-  fprintf(stderr, "From : %3d  To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
+  fprintf(stderr, "From : %3d  To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
 
   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
diff --git a/c_check b/c_check
index e8f90e18a..4d12c1674 100755
--- a/c_check
+++ b/c_check
@@ -31,13 +31,17 @@ flags="$*"
 
 cross_suffix=""
 
-if [ "`dirname \"$compiler_name\"`" != '.' ]; then
-    cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
+if [ "`dirname "$compiler_name"`" != '.' ]; then
+    cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
 fi
 
-bn=`basename $compiler_name`
+cn=`echo $compiler_name | sed -e 's/ -.*//'`
+bn=`basename "$cn"`
+
 case "$bn" in
-    *-*) cross_suffix="$cross_suffix${bn%-*}-"
+    *-*) if [ "$bn" != '-' ]; then
+             cross_suffix="$cross_suffix${bn%-*}-"
+         fi
 esac
 
 compiler=""
@@ -164,7 +168,7 @@ fi
 no_msa=0
 if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
-    tmpd="$(mktemp -d)"
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"addvi.b $w0, $w1, 1"'
     msa_flags='-mmsa -mfp64 -mload-store-pairs'
@@ -181,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
     rm -rf "$tmpd"
 fi
 
+no_lsx=0
+no_lasx=0
+if [ "$architecture" = "loongarch64" ]; then
+    tmpd="$(mktemp -d)"
+    tmplsx="$tmpd/lsx.c"
+    codelsx='"vadd.b $vr0, $vr0, $vr0"'
+    lsx_flags='-march=loongarch64 -mlsx'
+    printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
+    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
+    args="$lsx_flags -o $tmplsx.o $tmplsx"
+    {
+        $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        no_lsx=1
+    }
+
+    tmplasx="$tmpd/lasx.c"
+    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
+    lasx_flags='-march=loongarch64 -mlasx'
+    printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
+    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
+    args="$lasx_flags -o $tmplasx.o $tmplasx"
+    {
+        $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        no_lasx=1
+    }
+
+    rm -rf "$tmpd"
+fi
+
 case "$data" in
 *ARCH_X86_64*) architecture=x86_64 ;;
 *ARCH_X86*) architecture=x86 ;;
@@ -204,7 +239,7 @@ esac
 
 no_avx512=0
 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
     printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@@ -225,7 +260,7 @@ fi
 
 no_rv64gv=0
 if [ "$architecture" = "riscv64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"vsetvli zero, zero, e8, m1\n"'
     printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@@ -241,13 +276,16 @@ fi
 
 no_sve=0
 if [ "$architecture" = "arm64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
     args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
     no_sve=0
     {
         $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
+        $compiler_name $flags $args >/dev/null 2>&1
     } || {
         no_sve=1
     }
@@ -257,7 +295,7 @@ fi
 
 c11_atomics=0
 case "$data" in
 *HAVE_C11*)
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
     args=" -c -o $tmpf.o $tmpf"
@@ -395,6 +433,8 @@ done
 [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
 [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
 [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
+[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
+[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
 } >> "$makefile"
 
 os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]' `
@@ -410,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
 [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
 [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
 [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
+[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
+[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
 } >> "$config"
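Note on the c_check hunk above: the new LSX/LASX detection works like the existing MSA and AVX512 probes, writing a one-line test file that inlines a vector instruction, trying to compile it with the matching -mlsx / -mlasx flag, and recording NO_LSX=1 or NO_LASX=1 in the generated make configuration when the compiler rejects it. Below is a standalone sketch of the LSX probe that can be run by hand; the cross-compiler name is an assumption (any loongarch64 gcc or clang that understands -mlsx will do), while the test source and flags mirror what c_check writes:

    #!/bin/sh
    CC=loongarch64-unknown-linux-gnu-gcc   # assumed cross-compiler name
    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
    # Same probe source that c_check generates for LSX.
    printf '#include <lsxintrin.h>\n\nvoid main(void){ __asm__ volatile("vadd.b $vr0, $vr0, $vr0");}\n' > "$tmpd/lsx.c"
    if $CC -march=loongarch64 -mlsx -o "$tmpd/lsx.o" "$tmpd/lsx.c" >/dev/null 2>&1; then
        echo "LSX supported"
    else
        echo "LSX not supported (c_check would emit NO_LSX=1)"
    fi
    rm -rf "$tmpd"

The LASX variant differs only in the instruction ("xvadd.b $xr0, $xr0, $xr0"), the header (lasxintrin.h), and the -mlasx flag.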
diff --git a/c_check.pl b/c_check.pl
index 6ce28e11b..7a860a211 100644
--- a/c_check.pl
+++ b/c_check.pl
@@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
     }
 }
 
+$no_lsx = 0;
+$no_lasx = 0;
+if (($architecture eq "loongarch64")) {
+    eval "use File::Temp qw(tempfile)";
+    if ($@){
+        warn "could not load PERL module File::Temp, so could not check LSX and LASX compatibility";
+    } else {
+        $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
+        $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
+        $lsx_flags = "-march=loongarch64 -mlsx";
+        print $tmplsx "#include <lsxintrin.h>\n\n";
+        print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
+
+        $args = "$lsx_flags -o $tmplsx.o $tmplsx";
+        my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
+        system(@cmd) == 0;
+        if ($? != 0) {
+            $no_lsx = 1;
+        } else {
+            $no_lsx = 0;
+        }
+        unlink("$tmplsx.o");
+
+        $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
+        $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
+        $lasx_flags = "-march=loongarch64 -mlasx";
+        print $tmplasx "#include <lasxintrin.h>\n\n";
+        print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
+
+        $args = "$lasx_flags -o $tmplasx.o $tmplasx";
+        my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
+        system(@cmd) == 0;
+        if ($? != 0) {
+            $no_lasx = 1;
+        } else {
+            $no_lasx = 0;
+        }
+        unlink("$tmplasx.o");
+    }
+}
+
 $architecture = x86 if ($data =~ /ARCH_X86/);
 $architecture = x86_64 if ($data =~ /ARCH_X86_64/);
 $architecture = e2k if ($data =~ /ARCH_E2K/);
@@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
 print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
 print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
 print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
+print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
+print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
 
 $os =~ tr/[a-z]/[A-Z]/;
 $architecture =~ tr/[a-z]/[A-Z]/;
@@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
 print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
 print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
 print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
+print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
+print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
 
 if ($os eq "LINUX") {
diff --git a/cblas.h b/cblas.h
index c2bdd27fa..8a5055cf8 100644
--- a/cblas.h
+++ b/cblas.h
@@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                   OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
 
-void cblas_xerbla(blasint p, char *rout, char *form, ...);
+void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
 
 /*** BLAS extensions ***/
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 8521f3988..ebdc5a833 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
   if (ARM64)
     set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
-      set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
+      set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
     endif ()
     if (DYNAMIC_LIST)
       set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
@@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
     endif ()
     if (NOT NO_AVX512)
-      set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
+      set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
       string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
     endif ()
     if (DYNAMIC_LIST)
@@ -135,7 +135,7 @@ if (ARM64)
   set(BINARY_DEFINED 1)
 endif ()
 
-if (${ARCH} STREQUAL "riscv64")
+if (RISCV64)
   set(NO_BINARY_MODE 1)
   set(BINARY_DEFINED 1)
 endif ()
diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 83b8d15ab..7b4ef8947 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
   endif ()
 endif ()
 
+if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
+  if (POWER)
+    set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
+  else ()
+    set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
+  endif ()
+endif ()
+
 if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
   if (BINARY64)
     set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
@@ -172,22 +180,30 @@ endif ()
 
 if (${CORE} STREQUAL NEOVERSEN2)
   if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
     else ()
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
-    endif()
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+        set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+      else ()
+        set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      endif()
+    endif ()
   endif ()
 endif ()
 
 if (${CORE} STREQUAL NEOVERSEV1)
   if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
     else ()
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+        set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
+      else ()
+        set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      endif()
     endif()
   endif ()
 endif ()
@@ -205,7 +221,11 @@ endif ()
 
 if (${CORE} STREQUAL ARMV8SVE)
   if (NOT DYNAMIC_ARCH)
-    set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
+    else ()
+      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+    endif ()
   endif ()
 endif ()
diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index e615e148e..c496f6368 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -3,7 +3,8 @@
 ## Description: Ported from portion of OpenBLAS/Makefile.system
 ## Sets Fortran related variables.
-if (${F_COMPILER} STREQUAL "FLANG") +if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # This is for classic Flang. LLVM Flang is handled with gfortran below. set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") @@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95") endif () endif () -if (${F_COMPILER} STREQUAL "GFORTRAN") +if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - # ensure reentrancy of lapack codes - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") - # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") - #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc - if (NOT NO_LAPACK) - set(EXTRALIB "${EXTRALIB} -lgfortran") + if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # ensure reentrancy of lapack codes + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") + if (NOT NO_LAPACK) + # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc + set(EXTRALIB "${EXTRALIB} -lgfortran") + endif () endif () if (NO_BINARY_MODE) if (MIPS64) @@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") endif () endif () + if (RISCV64) + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") @@ -121,7 +131,7 @@ if (${F_COMPILER} STREQUAL "IBM") endif () endif () -if (${F_COMPILER} STREQUAL "PGI") +if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") if (BINARY64) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 45dda8686..5c6290484 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -124,7 +124,7 @@ set(SLASRC ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f - slatrs3.f strsyl3.f sgelst.f) + slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -187,7 +187,7 @@ set(CLASRC cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f - crot.f cspcon.f csprfs.f cspsv.f + crot.f crscl.f cspcon.f csprfs.f cspsv.f cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f cstegr.f cstein.f csteqr.f csycon.f csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f @@ -223,7 +223,7 @@ set(CLASRC chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f cungtsqr.f cungtsqr_row.f cunhr_col.f - clatrs3.f ctrsyl3.f cgelst.f) + clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -316,7 +316,7 @@ set(DLASRC dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f 
dorhr_col.f - dlatrs3.f dtrsyl3.f dgelst.f) + dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -381,7 +381,7 @@ set(ZLASRC zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f - zrot.f zspcon.f zsprfs.f zspsv.f + zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f zstegr.f zstein.f zsteqr.f zsycon.f zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f @@ -419,7 +419,7 @@ set(ZLASRC zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zungtsqr.f zungtsqr_row.f zunhr_col.f - zlatrs3.f ztrsyl3.f zgelst.f) + zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f @@ -436,6 +436,7 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) @@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.f) @@ -622,7 +624,7 @@ set(SLASRC ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c slaorhr_col_getrfnp.c slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c - slatrs3.c strsyl3.c sgelst.c) + slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -684,7 +686,7 @@ set(CLASRC cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c - crot.c cspcon.c csprfs.c cspsv.c + crot.c crscl.c cspcon.c csprfs.c cspsv.c cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c cstegr.c cstein.c csteqr.c csycon.c csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c @@ -720,7 +722,7 @@ set(CLASRC chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c cungtsqr.c cungtsqr_row.c cunhr_col.c - clatrs3.c ctrsyl3.c cgelst.c) + clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -812,7 +814,7 @@ set(DLASRC dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c - dlatrs3.c dtrsyl3.c dgelst.c) + dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -876,7 +878,7 @@ set(ZLASRC zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c - zrot.c 
zspcon.c zsprfs.c zspsv.c + zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c zstegr.c zstein.c zsteqr.c zsycon.c zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c @@ -913,7 +915,8 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c + zgedmd.c zgedmdq.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c @@ -930,6 +933,7 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) @@ -943,6 +947,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.c) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 3a9352197..f43bf10d0 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -70,8 +70,6 @@ set(CSRC lapacke_cgeqlf_work.c lapacke_cgeqp3.c lapacke_cgeqp3_work.c - lapacke_cgeqpf.c - lapacke_cgeqpf_work.c lapacke_cgeqr.c lapacke_cgeqr_work.c lapacke_cgeqr2.c @@ -92,6 +90,10 @@ set(CSRC lapacke_cgerqf_work.c lapacke_cgesdd.c lapacke_cgesdd_work.c + lapacke_cgedmd.c + lapacke_cgedmd_work.c + lapacke_cgedmdq.c + lapacke_cgedmdq_work.c lapacke_cgesv.c lapacke_cgesv_work.c lapacke_cgesvd.c @@ -144,12 +146,8 @@ set(CSRC lapacke_cggqrf_work.c lapacke_cggrqf.c lapacke_cggrqf_work.c - lapacke_cggsvd.c - lapacke_cggsvd_work.c lapacke_cggsvd3.c lapacke_cggsvd3_work.c - lapacke_cggsvp.c - lapacke_cggsvp_work.c lapacke_cggsvp3.c lapacke_cggsvp3_work.c lapacke_cgtcon.c @@ -564,6 +562,8 @@ set(CSRC lapacke_ctrsna_work.c lapacke_ctrsyl.c lapacke_ctrsyl_work.c + lapacke_ctrsyl3.c + lapacke_ctrsyl3_work.c lapacke_ctrtri.c lapacke_ctrtri_work.c lapacke_ctrtrs.c @@ -596,6 +596,8 @@ set(CSRC lapacke_cungtr_work.c lapacke_cungtsqr_row.c lapacke_cungtsqr_row_work.c + lapacke_cunhr_col.c + lapacke_cunhr_col_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -695,8 +697,6 @@ set(DSRC lapacke_dgeqlf_work.c lapacke_dgeqp3.c lapacke_dgeqp3_work.c - lapacke_dgeqpf.c - lapacke_dgeqpf_work.c lapacke_dgeqr.c lapacke_dgeqr_work.c lapacke_dgeqr2.c @@ -717,6 +717,10 @@ set(DSRC lapacke_dgerqf_work.c lapacke_dgesdd.c lapacke_dgesdd_work.c + lapacke_dgedmd.c + lapacke_dgedmd_work.c + lapacke_dgedmdq.c + lapacke_dgedmdq_work.c lapacke_dgesv.c lapacke_dgesv_work.c lapacke_dgesvd.c @@ -771,12 +775,8 @@ set(DSRC lapacke_dggqrf_work.c lapacke_dggrqf.c lapacke_dggrqf_work.c - lapacke_dggsvd.c - lapacke_dggsvd_work.c lapacke_dggsvd3.c lapacke_dggsvd3_work.c - lapacke_dggsvp.c - lapacke_dggsvp_work.c lapacke_dggsvp3.c lapacke_dggsvp3_work.c lapacke_dgtcon.c @@ -874,6 +874,8 @@ set(DSRC lapacke_dorgtr_work.c lapacke_dorgtsqr_row.c lapacke_dorgtsqr_row_work.c + lapacke_dorhr_col.c + lapacke_dorhr_col_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1186,6 +1188,8 @@ set(DSRC lapacke_dtrsna_work.c lapacke_dtrsyl.c 
    lapacke_dtrsyl_work.c
+    lapacke_dtrsyl3.c
+    lapacke_dtrsyl3_work.c
     lapacke_dtrtri.c
     lapacke_dtrtri_work.c
     lapacke_dtrtrs.c
@@ -1275,8 +1279,6 @@ set(SSRC
     lapacke_sgeqlf_work.c
     lapacke_sgeqp3.c
     lapacke_sgeqp3_work.c
-    lapacke_sgeqpf.c
-    lapacke_sgeqpf_work.c
     lapacke_sgeqr.c
     lapacke_sgeqr_work.c
     lapacke_sgeqr2.c
@@ -1297,6 +1299,10 @@ set(SSRC
     lapacke_sgerqf_work.c
     lapacke_sgesdd.c
     lapacke_sgesdd_work.c
+    lapacke_sgedmd.c
+    lapacke_sgedmd_work.c
+    lapacke_sgedmdq.c
+    lapacke_sgedmdq_work.c
     lapacke_sgesv.c
     lapacke_sgesv_work.c
     lapacke_sgesvd.c
@@ -1351,12 +1357,8 @@ set(SSRC
     lapacke_sggqrf_work.c
     lapacke_sggrqf.c
     lapacke_sggrqf_work.c
-    lapacke_sggsvd.c
-    lapacke_sggsvd_work.c
     lapacke_sggsvd3.c
     lapacke_sggsvd3_work.c
-    lapacke_sggsvp.c
-    lapacke_sggsvp_work.c
     lapacke_sggsvp3.c
     lapacke_sggsvp3_work.c
     lapacke_sgtcon.c
@@ -1453,6 +1455,8 @@ set(SSRC
     lapacke_sorgtr_work.c
     lapacke_sorgtsqr_row.c
     lapacke_sorgtsqr_row_work.c
+    lapacke_sorhr_col.c
+    lapacke_sorhr_col_work.c
     lapacke_sormbr.c
     lapacke_sormbr_work.c
     lapacke_sormhr.c
@@ -1762,6 +1766,8 @@ set(SSRC
     lapacke_strsna_work.c
     lapacke_strsyl.c
     lapacke_strsyl_work.c
+    lapacke_strsyl3.c
+    lapacke_strsyl3_work.c
     lapacke_strtri.c
     lapacke_strtri_work.c
     lapacke_strtrs.c
@@ -1849,8 +1855,6 @@ set(ZSRC
     lapacke_zgeqlf_work.c
     lapacke_zgeqp3.c
     lapacke_zgeqp3_work.c
-    lapacke_zgeqpf.c
-    lapacke_zgeqpf_work.c
     lapacke_zgeqr.c
     lapacke_zgeqr_work.c
     lapacke_zgeqr2.c
@@ -1871,6 +1875,10 @@ set(ZSRC
     lapacke_zgerqf_work.c
     lapacke_zgesdd.c
     lapacke_zgesdd_work.c
+    lapacke_zgedmd.c
+    lapacke_zgedmd_work.c
+    lapacke_zgedmdq.c
+    lapacke_zgedmdq_work.c
     lapacke_zgesv.c
     lapacke_zgesv_work.c
     lapacke_zgesvd.c
@@ -1925,12 +1933,8 @@ set(ZSRC
     lapacke_zggqrf_work.c
     lapacke_zggrqf.c
     lapacke_zggrqf_work.c
-    lapacke_zggsvd.c
-    lapacke_zggsvd_work.c
     lapacke_zggsvd3.c
     lapacke_zggsvd3_work.c
-    lapacke_zggsvp.c
-    lapacke_zggsvp_work.c
     lapacke_zggsvp3.c
     lapacke_zggsvp3_work.c
     lapacke_zgtcon.c
@@ -2343,6 +2347,8 @@ set(ZSRC
     lapacke_ztrsna_work.c
     lapacke_ztrsyl.c
     lapacke_ztrsyl_work.c
+    lapacke_ztrsyl3.c
+    lapacke_ztrsyl3_work.c
     lapacke_ztrtri.c
     lapacke_ztrtri_work.c
     lapacke_ztrtrs.c
@@ -2375,6 +2381,8 @@ set(ZSRC
     lapacke_zungtr_work.c
     lapacke_zungtsqr_row.c
     lapacke_zungtsqr_row_work.c
+    lapacke_zunhr_col.c
+    lapacke_zunhr_col_work.c
     lapacke_zunmbr.c
     lapacke_zunmbr_work.c
     lapacke_zunmhr.c
@@ -2401,6 +2409,12 @@ set(ZSRC
     lapacke_csyr_work.c
     lapacke_ilaver.c
 )
+if (BUILD_LAPACK_DEPRECATED)
+set(SSRC ${SSRC} lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
+set(DSRC ${DSRC} lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
+set(CSRC ${CSRC} lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
+set(ZSRC ${ZSRC} lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
+endif()
 
 set(SRCX
   lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 631e7fe69..bc87f7b44 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -55,7 +55,7 @@ if (DEFINED TARGET)
 endif ()
 
 # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
-if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
+if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
 endif ()
 
@@ -280,7 +280,41 @@ if (DEFINED TARGET)
   if (${TARGET} STREQUAL POWER8)
   set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
   endif()
+
+if (${TARGET} STREQUAL NEOVERSEV1)
+  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
+  else ()
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
+    else ()
+      message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
+    endif()
+  endif()
+  endif()
+  if (${TARGET} STREQUAL NEOVERSEN2)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    else ()
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+      else ()
+        message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
+      endif()
+    endif()
+  endif()
+  if (${TARGET} STREQUAL ARMV8SVE)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve")
+    else ()
+      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
+    endif()
+  endif()
+  endif()
+
 if (DEFINED BINARY)
   message(STATUS "Compiling a ${BINARY}-bit binary.")
 endif ()
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index c59e85d54..49b9863e3 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
   set(LOONGARCH64 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
+  set(RISCV64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   if (NOT BINARY)
     if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
@@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
   set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
   if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
     set(ARM64 1)
   else()
@@ -107,7 +109,7 @@ else()
 endif ()
 
 if (NOT BINARY)
-  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
+  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
     set(BINARY 64)
   else ()
     set(BINARY 32)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 56c1cb060..5e8ba866b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
       #message(STATUS "skipping ${makefile_line}")
       continue ()
     endif ()
+
+    # Example 1: SBGEMM_SMALL_M_PERMIT =
+    # Unset the variable
+    string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match
"${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + set(var_name ${CMAKE_MATCH_1}) + unset(${var_name}) + endif() + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") #message(STATUS "match on ${line_match}") diff --git a/common.h b/common.h index 4eeeb8d55..4074df069 100644 --- a/common.h +++ b/common.h @@ -525,7 +525,7 @@ static inline unsigned long long rpcc(void){ #endif // !RPCC_DEFINED #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; diff --git a/common_alpha.h b/common_alpha.h index 021eb93ae..e5380454a 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -45,7 +45,7 @@ #define WMB asm("wmb") #define RMB asm("mb") -static void __inline blas_lock(unsigned long *address){ +static __inline void blas_lock(unsigned long *address){ #ifndef __DECC unsigned long tmp1, tmp2; asm volatile( diff --git a/common_arm.h b/common_arm.h index 682315de5..a3db9953c 100644 --- a/common_arm.h +++ b/common_arm.h @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ int register ret; diff --git a/common_arm64.h b/common_arm64.h index 6a18a294c..436ccb8f5 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ BLASULONG ret; diff --git a/common_loongarch64.h b/common_loongarch64.h index e15539b5f..ce1fcf091 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } +#ifndef NO_AFFINITY +static inline int WhereAmI(void){ + int ret = 0, counter = 0; + __asm__ volatile ( + "rdtimel.w %[counter], %[id]" + : [id]"=r"(ret), [counter]"=r"(counter) + : + : "memory" + ); + return ret; +} +#endif + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") #else diff --git a/common_param.h b/common_param.h index 1854570f1..c082d248e 100644 --- a/common_param.h +++ b/common_param.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -45,12 +46,14 @@ typedef struct { int dtb_entries; + int switch_ratio; int offsetA, offsetB, align; #if BUILD_BFLOAT16 == 1 int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; int sbgemm_align_k; + int need_amxtile_permission; // 0 default, 1 for device support amx. 
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/common_power.h b/common_power.h index a49197fd7..3fe776f23 100644 --- a/common_power.h +++ b/common_power.h @@ -91,7 +91,7 @@ void *qalloc(int flags, size_t bytes); -static void INLINE blas_lock(volatile unsigned long *address){ +static INLINE void blas_lock(volatile unsigned long *address){ long int ret, val = 1; diff --git a/common_sparc.h b/common_sparc.h index 90a24ebf1..4b9e7840a 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -45,7 +45,7 @@ #ifndef ASSEMBLER -static void __inline blas_lock(volatile unsigned long *address){ +static __inline void blas_lock(volatile unsigned long *address){ long int ret = 1; diff --git a/common_thread.h b/common_thread.h index 05e1d5489..06a7a1a38 100644 --- a/common_thread.h +++ b/common_thread.h @@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads); /* Global Parameter */ extern int blas_cpu_number; extern int blas_num_threads; -extern int blas_num_threads_set; extern int blas_omp_linked; #define BLAS_LEGACY 0x8000U @@ -136,15 +135,13 @@ typedef struct blas_queue { #ifdef SMP_SERVER extern int blas_server_avail; +extern int blas_omp_number_max; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; - if (blas_num_threads_set == 0) openmp_nthreads=omp_get_max_threads(); - else - openmp_nthreads=blas_cpu_number; #endif #ifndef USE_OPENMP @@ -156,7 +153,13 @@ int openmp_nthreads; ) return 1; #ifdef USE_OPENMP - if (blas_cpu_number != openmp_nthreads) { + if (openmp_nthreads > blas_omp_number_max){ +#ifdef DEBUG + fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max); +#endif + openmp_nthreads = blas_omp_number_max; + } + if (blas_cpu_number != openmp_nthreads) { goto_set_num_threads(openmp_nthreads); } #endif diff --git a/common_x86.h b/common_x86.h index bc77eca58..65fb9a460 100644 --- a/common_x86.h +++ b/common_x86.h @@ -54,7 +54,7 @@ #define __volatile__ #endif -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ int ret; diff --git a/common_x86_64.h b/common_x86_64.h index 729a055ce..dda168d6c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -70,7 +70,7 @@ #define RMB #endif -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ #ifndef C_MSVC diff --git a/common_zarch.h b/common_zarch.h index 442bae821..80609251b 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef ASSEMBLER
 
 /*
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   BLASULONG ret;
 
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 1080ea974..e586f9a3c 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -267,8 +267,9 @@ int detect(void)
 }
 #else
 #ifdef __APPLE__
-	sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
-	if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
+	sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
+	if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
+	if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
 #endif
 	return CPU_ARMV8;
 #endif
diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
index ca07c7ffb..7c389db27 100644
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
 #include <stdint.h>
+#include <sys/auxv.h>
 
 /* If LASX extension instructions supported,
  * using core LOONGSON3R5
@@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3R5     1
 #define CPU_LOONGSON2K1000  2
 
-#define LOONGARCH_CFG2  0x02
-#define LOONGARCH_LASX  1<<7
-#define LOONGARCH_LSX   1<<6
+#define LA_HWCAP_LSX    (1<<4)
+#define LA_HWCAP_LASX   (1<<5)
 
 static char *cpuname[] = {
   "LOONGSONGENERIC",
@@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
 
 int detect(void) {
 #ifdef __linux
-  uint32_t reg = 0;
+  int flag = (int)getauxval(AT_HWCAP);
 
-  __asm__ volatile (
-    "cpucfg %0, %1 \n\t"
-    : "+&r"(reg)
-    : "r"(LOONGARCH_CFG2)
-  );
-
-  if (reg & LOONGARCH_LASX)
+  if (flag & LA_HWCAP_LASX)
     return CPU_LOONGSON3R5;
-  else if (reg & LOONGARCH_LSX)
+  else if (flag & LA_HWCAP_LSX)
     return CPU_LOONGSON2K1000;
   else
     return CPU_GENERIC;
diff --git a/cpuid_x86.c b/cpuid_x86.c
index ad13a8c8c..c485f3ddf 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1479,6 +1479,8 @@ int get_cpuname(void){
 	  else
 	    return CPUTYPE_NEHALEM;
         case 15: // Sapphire Rapids
+	  if(support_amx_bf16())
+	    return CPUTYPE_SAPPHIRERAPIDS;
 	  if(support_avx512_bf16())
 	    return CPUTYPE_COOPERLAKE;
 	  if(support_avx512())
@@ -1549,6 +1551,7 @@ int get_cpuname(void){
         case 7: // Raptor Lake
         case 10:
         case 15:
+        case 14: // Alder Lake N
 	  if(support_avx2())
 	    return CPUTYPE_HASWELL;
 	  if(support_avx())
@@ -1845,7 +1848,8 @@ static char *cpuname[] = {
   "ZEN",
   "SKYLAKEX",
   "DHYANA",
-  "COOPERLAKE"
+  "COOPERLAKE",
+  "SAPPHIRERAPIDS",
 };
 
 static char *lowercpuname[] = {
@@ -1902,7 +1906,8 @@ static char *lowercpuname[] = {
   "zen",
   "skylakex",
   "dhyana",
-  "cooperlake"
+  "cooperlake",
+  "sapphirerapids",
 };
 
 static char *corename[] = {
@@ -1936,7 +1941,8 @@ static char *corename[] = {
   "ZEN",
   "SKYLAKEX",
   "DHYANA",
-  "COOPERLAKE"
+  "COOPERLAKE",
+  "SAPPHIRERAPIDS",
 };
 
 static char *corename_lower[] = {
@@ -1970,7 +1976,8 @@ static char *corename_lower[] = {
   "zen",
   "skylakex",
   "dhyana",
-  "cooperlake"
+  "cooperlake",
+  "sapphirerapids",
 };
 
 
@@ -2276,16 +2283,18 @@ int get_coretype(void){
 	  return CORE_NEHALEM;
       }
       if (model == 15) { // Sapphire Rapids
+	if(support_amx_bf16())
+	  return CORE_SAPPHIRERAPIDS;
 	if(support_avx512_bf16())
-	  return CPUTYPE_COOPERLAKE;
+	  return CORE_COOPERLAKE;
 	if(support_avx512())
-	  return CPUTYPE_SKYLAKEX;
+	  return CORE_SKYLAKEX;
 	if(support_avx2())
-	  return CPUTYPE_HASWELL;
+	  return CORE_HASWELL;
 	if(support_avx())
-	  return CPUTYPE_SANDYBRIDGE;
+	  return CORE_SANDYBRIDGE;
 	else
-	  return CPUTYPE_NEHALEM;
+	  return CORE_NEHALEM;
       }
       break;
@@ -2352,6 +2361,7 @@ int get_coretype(void){
         case 7: // Raptor Lake
         case 10:
         case 15:
+        case 14: // Alder Lake N
 #ifndef NO_AVX2
 	  if(support_avx2())
 	    return CORE_HASWELL;
diff --git a/ctest/Makefile b/ctest/Makefile
index 0fb2450d2..9e85d23b9 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
-CEXTRALIB = -lomp
+CEXTRALIB += -lomp
 endif
 endif
 ifeq ($(F_COMPILER), NAG)
diff --git a/docs/distributing.md b/docs/distributing.md
new file mode 100644
index 000000000..1e6372a28
--- /dev/null
+++ b/docs/distributing.md
@@ -0,0 +1,270 @@
+# Guidance for redistributing OpenBLAS
+
+*We note that this document contains recommendations only - packagers and other
+redistributors are in charge of how OpenBLAS is built and distributed in their
+systems, and may have good reasons to deviate from the guidance given on this
+page. These recommendations are aimed at general packaging systems, with a user
+base that is typically large, open source (or at least freely available), that
+doesn't behave uniformly, and that the packager is not directly connected with.*
+
+OpenBLAS has a large number of build-time options which can be used to change
+how it behaves at runtime, how artifacts or symbols are named, etc. Variation
+in build configuration can be necessary to achieve a given end goal within a
+distribution or as an end user. However, such variation can also make it more
+difficult to build on top of OpenBLAS and ship code or other packages in a way
+that works across many different distros. Here we provide guidance about the
+most important build options, what effects they may have when changed, and
+which ones to default to.
+
+The Make and CMake build systems provide equivalent options and yield more or
+less the same artifacts, but not exactly (the CMake builds are still
+experimental). You can choose either one and the options will function in the
+same way, however the CMake outputs may require some renaming. To review
+available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
+the repository.
+
+Build options typically fall into two categories: (a) options that affect the
+user interface, such as library and symbol names or APIs that are made
+available, and (b) options that affect performance and runtime behavior, such
+as threading behavior or CPU architecture-specific code paths. The user
+interface options are more important to keep aligned between distributions,
+while for the performance-related options there are typically more reasons to
+make choices that deviate from the defaults.
+
+Here are recommendations for user interface related packaging choices where it
+is not likely to be a good idea to deviate (typically these are the default
+settings):
+
+1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
+   binary size much, so don't turn it off.
+2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
+   while it does make up a significant part of the binary size of the installed
+   library, that does not outweigh the regression in usability when deviating
+   from the default here.[^1]
+3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
+   detection files. These files are used by build systems when users want to
+   link against OpenBLAS, and there is no benefit to leaving them out (see the
+   example just after this list).
+4. Provide the LP64 interface by default, and if in addition to that you choose
+   to provide an ILP64 interface build as well, use a symbol suffix to avoid
+   symbol name clashes (see the next section).
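+As an illustration of point 3, a downstream build typically relies on those
+files to locate the library. A minimal sketch (`myprog.c` is a hypothetical
+source file; the exact flags pkg-config prints depend on how the package was
+built):
+```bash
+$ # compile and link a C program against the packaged OpenBLAS
+$ cc myprog.c $(pkg-config --cflags --libs openblas) -o myprog
+```
+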
+[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
+know. Older versions of Arch Linux did not, and that was known to cause
+problems.
+
+
+## ILP64 interface builds
+
+The LP64 (32-bit integer) interface is the default build, and has
+well-established C and Fortran APIs as determined by the reference (Netlib)
+BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
+not have a standard API: symbol names and shared/static library names can be
+produced in multiple ways, and this tends to make it difficult to use.
+As of today there is an agreed-upon way of choosing names for OpenBLAS between
+a number of key users/redistributors, which is the closest thing to a standard
+that there is now. However, there is an ongoing standardization effort in the
+reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
+agreed-upon convention. In this section we'll aim to explain both.
+
+Those two methods are fairly similar, and have a key thing in common: *using a
+symbol suffix*. This is good practice; it is recommended that any ILP64 build
+you distribute uses a symbol suffix containing `64` in the name.
+This avoids potential symbol clashes when different packages which depend on
+OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
+
+### The current OpenBLAS agreed-upon ILP64 convention
+
+This convention comprises the shared library name and the symbol suffix in the
+shared library. The symbol suffix to use is `64_`, implying that the library
+name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
+The central issue where this was discussed is
+[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
+include Fedora, Julia, NumPy and SciPy; SuiteSparse already used it as well.
+
+To build shared and static libraries with the currently recommended ILP64
+conventions with Make:
+```bash
+$ make INTERFACE64=1 SYMBOLSUFFIX=64_
+```
+
+This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
+named `openblas64.pc`, and CMake and header files.
+
+Installing locally and inspecting the output will show a few more details:
+```bash
+$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
+$ tree .  # output slightly edited down
+.
+├── include
+│   ├── cblas.h
+│   ├── f77blas.h
+│   ├── lapacke_config.h
+│   ├── lapacke.h
+│   ├── lapacke_mangling.h
+│   ├── lapacke_utils.h
+│   ├── lapack.h
+│   └── openblas_config.h
+└── lib
+    ├── cmake
+    │   └── openblas
+    │       ├── OpenBLASConfig.cmake
+    │       └── OpenBLASConfigVersion.cmake
+    ├── libopenblas64_.a
+    ├── libopenblas64_.so
+    └── pkgconfig
+        └── openblas64.pc
+```
+
+A key point is the symbol names. These will equal the LP64 symbol names, then
+(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
+Hence to obtain the final symbol names, we need to take into account which
+Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
+Fortran, or Flang), that means appending a single underscore. In that case, the
+result is:
+
+| base API name | binary symbol name | call from Fortran code | call from C code      |
+|---------------|--------------------|------------------------|-----------------------|
+| `dgemm`       | `dgemm_64_`        | `dgemm_64(...)`        | `dgemm_64_(...)`      |
+| `cblas_dgemm` | `cblas_dgemm64_`   | n/a                    | `cblas_dgemm64_(...)` |
+
+It is quite useful to have these symbol names be as uniform as possible across
+different packaging systems.
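+A quick way to verify that a given binary follows this convention is to
+inspect its dynamic symbol table for the suffixed names from the table above:
+```bash
+$ # both the Fortran and the CBLAS entry points should carry the suffix
+$ nm -D libopenblas64_.so | grep -E ' (dgemm_64_|cblas_dgemm64_)$'
+```
+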
+The equivalent build options with CMake are:
+```bash
+$ mkdir build && cd build
+$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
+$ cmake --build . -j
+```
+
+Note that the result is not 100% identical to the Make result. For example, the
+library name ends in `_64` rather than `64_`; it is recommended to rename the
+libraries to match the Make library names (also update the `libsuffix` entry in
+`openblas64.pc` to match that rename).
+```bash
+$ cmake --install . --prefix $PWD/../../openblas/cmake64
+$ tree .
+.
+├── include
+│   └── openblas64
+│       ├── cblas.h
+│       ├── f77blas.h
+│       ├── lapacke_config.h
+│       ├── lapacke_example_aux.h
+│       ├── lapacke.h
+│       ├── lapacke_mangling.h
+│       ├── lapacke_utils.h
+│       ├── lapack.h
+│       ├── openblas64
+│       │   └── lapacke_mangling.h
+│       └── openblas_config.h
+└── lib
+    ├── cmake
+    │   └── OpenBLAS64
+    │       ├── OpenBLAS64Config.cmake
+    │       ├── OpenBLAS64ConfigVersion.cmake
+    │       ├── OpenBLAS64Targets.cmake
+    │       └── OpenBLAS64Targets-noconfig.cmake
+    ├── libopenblas_64.a
+    ├── libopenblas_64.so -> libopenblas_64.so.0
+    └── pkgconfig
+        └── openblas64.pc
+```
+
+
+### The upcoming standardized ILP64 convention
+
+While the `64_` convention above got some adoption, it's slightly hacky and is
+implemented through the use of `objcopy`. An effort is ongoing for a more
+broadly adopted convention in the reference BLAS and LAPACK libraries, using
+(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
+Fortran compiler mangling. The central issue for this is
+[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
+
+For the most common cases of compiler mangling (a single `_` appended), the end
+result will be:
+
+| base API name | binary symbol name | call from Fortran code | call from C code      |
+|---------------|--------------------|------------------------|-----------------------|
+| `dgemm`       | `dgemm_64_`        | `dgemm_64(...)`        | `dgemm_64_(...)`      |
+| `cblas_dgemm` | `cblas_dgemm_64`   | n/a                    | `cblas_dgemm_64(...)` |
+
+For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
+
+The shared library name for this `_64` convention should be `libopenblas_64.so`.
+
+Note: it is not yet possible to produce an OpenBLAS build which employs this
+convention! Once reference BLAS and LAPACK with support for `_64` have been
+released, a future OpenBLAS release will support it. For now, please use the
+older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
+considered reserved for future use of the `_64` standard as prescribed by
+reference BLAS/LAPACK.
+
+
+## Performance and runtime behavior related build options
+
+For these options there are multiple reasonable or common choices.
+
+### Threading related options
+
+OpenBLAS can be built as a multi-threaded or single-threaded library, with the
+default being multi-threaded. It's expected that the default `libopenblas`
+library is multi-threaded; if you'd like to also distribute single-threaded
+builds, consider naming them `libopenblas_sequential`.
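+A sketch of one way to produce such a build with Make, using the
+`LIBNAMESUFFIX` option from `Makefile.rule` (`USE_LOCKING=1` keeps the
+single-threaded library safe to call from multi-threaded programs):
+```bash
+$ # builds libopenblas_sequential.so|a with no internal threading
+$ make USE_THREAD=0 USE_LOCKING=1 LIBNAMESUFFIX=sequential
+```
+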
+ +OpenBLAS can be built with pthreads or OpenMP as the threading model, with the +default being pthreads. Both options are commonly used, and the choice here +should not influence the shared library name. The choice will be captured by +the `.pc` file. E.g.,: +```bash +$ pkg-config --libs openblas +-fopenmp -lopenblas + +$ cat openblas.pc +... +openblas_config= ... USE_OPENMP=0 MAX_THREADS=24 +``` + +The maximum number of threads users will be able to use is determined at build +time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide +range of values that are reasonable to use (up to 256). 64 is a typical choice +here; there is a memory footprint penalty that is linear in `NUM_THREADS`. +Please see `Makefile.rule` for more details. + +### CPU architecture related options + +OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when +distributing to a user base with a variety of hardware, it is recommended to +enable CPU architecture runtime detection. This will dynamically select +optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1` +build option. This is usually done on all common CPU families, except when +there are known issues. + +In case the CPU architecture is known (e.g. you're building binaries for macOS +M1 users), it is possible to specify the target architecture directly with the +`TARGET=` build option. + +`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md` +in this repository. + + +## Real-world examples + +OpenBLAS is likely to be distributed in one of these distribution models: + +1. As a standalone package, or multiple packages, in a packaging ecosystem like + a Linux distro, Homebrew, conda-forge or MSYS2. +2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R. +3. Locally, e.g. making available as a build on a single HPC cluster. + +The guidance on this page is most important for models (1) and (2). These links +to build recipes for a representative selection of packaging systems may be +helpful as a reference: + +- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec) +- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules) +- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb) +- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD) +- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh) +- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh) +- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 39824fc5a..26d07fa94 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. 
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG divN, divT; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); @@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } */ - if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) { GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } @@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO divT = nthreads; divN = 1; - while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) { do { divT --; divN = 1; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index d7dcd68a3..b03577fb3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO int mode, mask; double dnum, di, dinum; - if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + + if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 02b60b50d..c9ecf73e8 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - #ifndef GEMM_PREFERED_SIZE #define GEMM_PREFERED_SIZE 1 #endif @@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); BLASLONG width, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get execution mode */ #ifndef COMPLEX @@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); num_parts = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < SWITCH_RATIO) { - width = SWITCH_RATIO; + if (width < switch_ratio) { + width = switch_ratio; } width = round_up(n, width, GEMM_PREFERED_SIZE); @@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG nthreads_m, nthreads_n; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get dimensions from index ranges if available */ if (range_m) { @@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF n = range_n[1] - range_n[0]; } - /* Partitions in m should have at least SWITCH_RATIO rows */ - if (m < 2 * SWITCH_RATIO) { + /* Partitions in m should have at least switch_ratio rows */ + if (m < 2 * switch_ratio) { nthreads_m = 1; } else { nthreads_m = args -> nthreads; - while (m < nthreads_m * SWITCH_RATIO) { + while (m < nthreads_m * switch_ratio) { nthreads_m = nthreads_m / 2; } } - /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ - if (n < SWITCH_RATIO * nthreads_m) { + /* Partitions in n should have at most switch_ratio * nthreads_m columns */ + if (n < switch_ratio * nthreads_m) { nthreads_n = 1; } else { - nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); + nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m); if (nthreads_m * nthreads_n > args -> nthreads) { nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); } diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 051513f27..a8a84acbb 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) { increased_threads = 1; - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? 
blas_num_threads - 1 : 0; i < num_threads - 1; i++){ atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 2e0c0f38c..fe6b4a7c0 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -68,6 +68,7 @@ #endif int blas_server_avail = 0; +int blas_omp_number_max = 0; extern int openblas_omp_adaptive_env(); @@ -100,8 +101,6 @@ static void adjust_thread_buffers() { void goto_set_num_threads(int num_threads) { - blas_num_threads_set = 1; - if (num_threads < 0) blas_num_threads_set = 0; if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) { } int blas_thread_init(void){ +if(blas_omp_number_max <= 0) + blas_omp_number_max = omp_get_max_threads(); blas_get_cpu_number(); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index afa33cccc..5bdfc1276 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads) blas_server_avail = 1; } - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index f61930983..8e0f53f74 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE; #else #define gotoblas_COOPERLAKE gotoblas_PRESCOTT #endif +#ifdef DYN_SAPPHIRERAPIDS +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; +#elif defined(DYN_SKYLAKEX) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX +#elif defined(DYN_HASWELL) +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM +#else +#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT +#endif #else // not DYNAMIC_LIST @@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; extern gotoblas_t gotoblas_COOPERLAKE; +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; #else #define gotoblas_SKYLAKEX gotoblas_HASWELL #define gotoblas_COOPERLAKE gotoblas_HASWELL +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL #endif #endif #else @@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_COOPERLAKE gotoblas_NEHALEM +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA @@ -378,6 +394,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == 
BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" @@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){ } } if (model == 15){ // Sapphire Rapids + if(support_amx_bf16()) + return &gotoblas_SAPPHIRERAPIDS; if(support_avx512_bf16()) return &gotoblas_COOPERLAKE; if (support_avx512()) @@ -941,7 +984,8 @@ static char *corename[] = { "Excavator", "Zen", "SkylakeX", - "Cooperlake" + "Cooperlake", + "SapphireRapids" }; char *gotoblas_corename(void) { @@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; + if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26]; return corename[0]; } diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 0f47b287c..530d18115 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; #else #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #endif +#ifdef DYN_ARMV8SVE +extern gotoblas_t gotoblas_ARMV8SVE; +#else +#define gotoblas_ARMV8SVE gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_CORTEXA55; #endif extern void openblas_warning(int verbose, const char * msg); +#define FALLBACK_VERBOSE 1 +#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. 
OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 16 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg); #ifndef HWCAP_CPUID #define HWCAP_CPUID (1 << 11) #endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif #define get_cpu_ftr(id, var) ({ \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ @@ -168,6 +181,7 @@ static char *corename[] = { "neoversen2", "thunderx3t110", "cortexa55", + "armv8sve", "unknown" }; @@ -187,6 +201,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; return corename[NUM_CORETYPES]; } @@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 12: return (&gotoblas_NEOVERSEN2); case 13: return (&gotoblas_THUNDERX3T110); case 14: return (&gotoblas_CORTEXA55); + case 15: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; #ifndef NO_SVE case 0xd49: - return &gotoblas_NEOVERSEN2; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + } else + return &gotoblas_NEOVERSEN2; case 0xd40: - return &gotoblas_NEOVERSEV1; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + }else + return &gotoblas_NEOVERSEV1; #endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; @@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } +#ifndef NO_SVE + if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { + return &gotoblas_ARMV8SVE; + } +#endif + return NULL; #endif } diff --git a/driver/others/memory.c b/driver/others/memory.c index 4493b7d71..4fceae754 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } @@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. 
This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } @@ -3015,6 +3011,8 @@ void *blas_memory_alloc(int procpos){ #endif if (memory_overflowed) goto terminate; fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); + fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n"); + fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS); memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 0b38d1887..6174d9b75 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -283,7 +283,6 @@ The numbers of threads in the thread pool. This value is equal or large than blas_cpu_number. This means some threads are sleep. */ int blas_num_threads = 0; -int blas_num_threads_set = 0; int goto_get_num_procs (void) { return blas_cpu_number; diff --git a/exports/gensymbol b/exports/gensymbol index f05de626f..704eab06f 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -21,7 +21,7 @@ blasobjsc=" chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2 csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm - ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum" + ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt" blasobjsd=" damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm @@ -29,7 +29,7 @@ blasobjsd=" dscal dsdot dspmv dspr2 dimatcopy domatcopy dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv - idamax idamin idmax idmin dgeadd dsum" + idamax idamin idmax idmin dgeadd dsum dgemmt" blasobjss=" isamax isamin ismax ismin @@ -38,7 +38,7 @@ blasobjss=" smax smin snrm2 simatcopy somatcopy srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv - strmm strmv strsm strsv sgeadd ssum" + strmm strmv strsm strsv sgeadd ssum sgemmt" blasobjsz=" izamax izamin @@ -48,7 +48,7 @@ blasobjsz=" zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv zomatcopy zimatcopy dzamax dzamin dzasum dznrm2 - zgeadd dzsum" + zgeadd dzsum zgemmt" blasobjs="lsame xerbla" bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" @@ -58,7 +58,7 @@ cblasobjsc=" cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv - cblas_scnrm2 cblas_scasum + cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy " cblasobjsd=" @@ -67,7 +67,7 @@ cblasobjsd=" cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2 cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv - cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd + cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd 
cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy " @@ -78,7 +78,7 @@ cblasobjss=" cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm - cblas_strsv cblas_sgeadd + cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy " @@ -89,7 +89,7 @@ cblasobjsz=" cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub - cblas_zaxpby cblas_zgeadd + cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy " @@ -716,6 +716,7 @@ lapackobjs2z="$lapackobjs2z # functions added for lapack-3.7.0 lapackobjs2s="$lapackobjs2s slarfy + ssyconvf strevc3 sgelqt sgelqt3 @@ -843,6 +844,23 @@ lapackobjs2z="$lapackobjs2z zungtsqr_row " +#functions added for lapack-3.11 +lapackobjs2c="$lapackobjs2c + cgedmd + cgedmdq + " +lapackobjs2d="$lapackobjs2d + dgedmd + dgedmdq + " +lapackobjs2s="$lapackobjs2s + sgedmd + sgedmdq + " +lapackobjs2z="$lapackobjs2z + zgedmd + zgedmdq + " lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1012,6 +1030,10 @@ lapackeobjsc=" LAPACKE_cgebrd_work LAPACKE_cgecon LAPACKE_cgecon_work + LAPACKE_cgedmd + LAPACKE_cgedmd_work + LAPACKE_cgedmdq + LAPACKE_cgedmdq_work LAPACKE_cgeequ LAPACKE_cgeequ_work LAPACKE_cgeequb @@ -1671,6 +1693,10 @@ lapackeobjsd=" LAPACKE_dgebrd_work LAPACKE_dgecon LAPACKE_dgecon_work + LAPACKE_dgedmd + LAPACKE_dgedmd_work + LAPACKE_dgedmdq + LAPACKE_dgedmdq_work LAPACKE_dgeequ LAPACKE_dgeequ_work LAPACKE_dgeequb @@ -2284,6 +2310,10 @@ lapackeobjss=" LAPACKE_sgebrd_work LAPACKE_sgecon LAPACKE_sgecon_work + LAPACKE_sgedmd + LAPACKE_sgedmd_work + LAPACKE_sgedmdq + LAPACKE_sgedmdq_work LAPACKE_sgeequ LAPACKE_sgeequ_work LAPACKE_sgeequb @@ -2893,6 +2923,10 @@ lapackeobjsz=" LAPACKE_zgebrd_work LAPACKE_zgecon LAPACKE_zgecon_work + LAPACKE_zgedmd + LAPACKE_zgedmd_work + LAPACKE_zgedmdq + LAPACKE_zgedmdq_work LAPACKE_zgeequ LAPACKE_zgeequ_work LAPACKE_zgeequb diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index e38a3cc89..dd79e924d 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -21,7 +21,7 @@ chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum); + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); @blasobjsd = ( damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, @@ -29,7 +29,7 @@ dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, - idamax,idamin,idmax,idmin,dgeadd,dsum); + idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); @blasobjss = ( isamax,isamin,ismax,ismin, @@ -38,7 +38,7 @@ smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv, sgeadd,ssum); + 
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); @blasobjsz = ( izamax,izamin,, @@ -48,7 +48,7 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, - zgeadd, dzsum); + zgeadd, dzsum, zgemmt); @blasobjs = (lsame, xerbla); @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @@ -60,7 +60,7 @@ cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, cblas_scnrm2, cblas_scasum, cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy - ); + cblas_cgemmt); @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, @@ -69,7 +69,7 @@ cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy - ); + cblas_dgemmt); @cblasobjss = ( cblas_sasum, cblas_saxpy, cblas_saxpby, @@ -80,7 +80,7 @@ cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, cblas_strsv, cblas_sgeadd, cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy - ); + cblas_sgemmt); @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, @@ -90,7 +90,7 @@ cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_zaxpby, cblas_zgeadd, cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy -); + cblas_zgemmt); @cblasobjs = ( cblas_xerbla ); diff --git a/f_check b/f_check index d071e016e..526c41dc6 100755 --- a/f_check +++ b/f_check @@ -101,7 +101,14 @@ else *flang*) vendor=FLANG openmp='-fopenmp' - ;; + data=`$compiler -v 2>&1 > /dev/null ` + v="${data#*version *}" + v="${v%%*.}" + major="${v%%.*}" + if [ "$major" -ge 17 ]; then + vendor=FLANGNEW + fi + ;; *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' diff --git a/getarch.c b/getarch.c index 937a8db68..87384c084 100644 --- a/getarch.c +++ b/getarch.c @@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n"); #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 - printf("MAKE += -j %d\n", MAKE_NB_JOBS); + printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS); #else // Let make use parent -j argument or -j1 if there // is no make parent #endif #elif NO_PARALLEL_MAKE==1 - printf("MAKE += -j 1\n"); + printf("MAKEFLAGS += -j 1\n"); #else - printf("MAKE += -j %d\n", get_num_cores()); + printf("MAKEFLAGS += -j %d\n", get_num_cores()); #endif break; diff --git a/interface/geadd.c b/interface/geadd.c index f0befa14a..3a0ea015d 100644 --- a/interface/geadd.c +++ b/interface/geadd.c @@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, info = 0; - if (lda < MAX(1, m)) info = 6; + if (lda < MAX(1, m)) info = 5; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; diff --git a/interface/gemm.c b/interface/gemm.c index 71cc77a1b..4778b641b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = { #endif #endif +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) +#define XFEATURE_XTILEDATA 18 +#define ARCH_REQ_XCOMP_PERM 0x1023 +static int openblas_amxtile_permission = 0; 
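AMX tile state on Linux is opt-in per process: before the first tile instruction, the process must request the XTILEDATA XSAVE component through arch_prctl(2), which is exactly what the init_amxtile_permission() helper below does once, caching the result in the flag above. A self-contained sketch of that request (illustrative; the constants mirror the kernel ABI):

```c
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_REQ_XCOMP_PERM 0x1023   /* arch_prctl: request XSAVE component permission */
#define XFEATURE_XTILEDATA  18       /* the AMX tile-data component */

int main(void) {
    /* glibc ships no wrapper for this arch_prctl code, so call syscall(2) directly */
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) != 0) {
        perror("arch_prctl(ARCH_REQ_XCOMP_PERM, XTILEDATA)");
        return 1;                    /* no AMX permission: fall back to non-AMX kernels */
    }
    printf("XTILEDATA permission granted; AMX tile instructions may be used\n");
    return 0;
}
```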
+static int init_amxtile_permission() { + long status = + syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + if (status != 0) { + fprintf(stderr, "XTILEDATA permission not granted on this device (Linux, " + "Intel Sapphire Rapids); skipping sbgemm computation\n"); + return -1; + } + openblas_amxtile_permission = 1; + return 0; +} +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) +#if defined(DYNAMIC_ARCH) + if (gotoblas->need_amxtile_permission && + openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS) + if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) + if ((args.m == 0) || (args.n == 0)) return; #if 0 diff --git a/interface/gemmt.c b/interface/gemmt.c index 3eed1dfe4..046432670 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -35,29 +35,26 @@ #include <stdio.h> #include <string.h> #include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE -#define ERROR_NAME "QGEMT " +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMT " +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMT " +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMT " +#define ERROR_NAME "SGEMMT " #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifdef XDOUBLE -#define ERROR_NAME "XGEMT " +#define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "ZGEMT " +#define ERROR_NAME "ZGEMMT " #else -#define ERROR_NAME "CGEMT " +#define ERROR_NAME "CGEMMT " #endif #endif @@ -68,18 +65,19 @@ #ifndef CBLAS void NAME(char *UPLO, char *TRANSA, char *TRANSB, - blasint * M, blasint * N, blasint * K, + blasint * M, blasint * K, FLOAT * Alpha, IFLOAT * a, blasint * ldA, IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) { - blasint m, n, k; + blasint m, k; blasint lda, ldb, ldc; int transa, transb, uplo; blasint info; char transA, transB, Uplo; + blasint nrowa, nrowb; IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -92,7 +90,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, PRINT_DEBUG_NAME; m = *M; - n = *N; k = *K; #if defined(COMPLEX) @@ -159,32 +156,39 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, if (Uplo == 'L') uplo = 1; + nrowa = m; + if (transa) nrowa = k; + nrowb = k; + if (transb) nrowb = m; + info = 0; - if (uplo < 0) - info = 14; - if (ldc < m) + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - if (info) { + if (info != 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, - blasint N, blasint k, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, #ifndef COMPLEX FLOAT alpha, IFLOAT * A, blasint LDA, @@ -205,17 +209,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, n,
lda, ldb; + blasint lda, ldb; FLOAT *a, *b; XFLOAT *buffer; PRINT_DEBUG_CNAME; + uplo = -1; transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) transa = 0; @@ -248,9 +255,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, transb = 3; #endif - m = M; - n = N; - a = (void *)A; b = (void *)B; lda = LDA; @@ -258,23 +262,31 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint nrowa, nrowb; + nrowa = m; + if (transa) nrowa = k; + nrowb = k; + if (transb) nrowb = m; + + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; } if (order == CblasRowMajor) { - m = N; - n = M; a = (void *)B; b = (void *)A; @@ -282,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) @@ -315,29 +330,30 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint ncola, ncolb; + ncola = k; + if (transa) ncola = m; + ncolb = m; + if (transb) ncolb = k; + + if (ldc < MAX(1,m)) info = 13; + if (ldb < MAX(1, ncolb)) + info = 10; + if (lda < MAX(1, ncola)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - } - uplo = -1; - if (Uplo == CblasUpper) - uplo = 0; - if (Uplo == CblasLower) - uplo = 1; - if (uplo < 0) - info = 14; - if (info >= 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; @@ -407,37 +423,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) || (n == 0)) + if (m == 0) return; IDEBUG_START; - FUNCTION_PROFILE_START(); - const blasint incb = (transb == 0) ? 
1 : ldb; if (uplo == 1) { - for (i = 0; i < n; i++) { - j = n - i; + for (i = 0; i < m; i++) { + j = m - i; l = j; #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; if (transa) { - l = k; aa = a + lda * i * 2; - bb = b + i * 2; } + if (transb) + bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; if (transa) { - l = k; aa = a + lda * i; - bb = b + i; } + if (transb) + bb = b + i; cc = c + i * ldc + i; #endif @@ -458,8 +472,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -479,20 +491,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); #else + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!transa) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); } #endif @@ -501,21 +527,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } } else { - for (i = 0; i < n; i++) { + for (i = 0; i < m; i++) { j = i + 1; l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transb) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transa) { - l = k; + if (transb) { bb = b + i; } cc = c + i * ldc; @@ -537,8 +561,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -558,30 +580,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); #else + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!transa) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); - + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); } #endif STACK_FREE(buffer); } } - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, - args.m * args.k + args.k * args.n + - args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 91975f7f4..4cf0966cc 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < *rows ) info = 8; + if ( trans == BlasTrans && *ldb < *cols ) info = 8; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < 
*cols ) info = 8; + if ( trans == BlasTrans && *ldb < *rows ) info = 8; } if ( order == BlasColMajor && *lda < *rows ) info = 7; @@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + #ifdef NEW_IMATCOPY - if ( *lda == *ldb && *rows == *cols) { + if ( *lda == *ldb ) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); + return; } - else + else if ( *rows == *cols ) { IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); + return; } } else @@ -138,26 +141,23 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); + return; } - else + else if ( *rows == *cols ) { IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); + return; } } - return; } - #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) { - printf("Memory alloc failed\n"); + printf("Memory alloc failed in imatcopy\n"); exit(1); } @@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, { if ( trans == BlasNoTrans ) { - OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb ); + OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb ); } else { - OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb ); } } else { if ( trans == BlasNoTrans ) { - OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb ); } else { - OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb ); } } diff --git a/interface/nrm2.c b/interface/nrm2.c index dc8c08e9a..331ebc3d0 100644 --- a/interface/nrm2.c +++ b/interface/nrm2.c @@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ if (n <= 0) return 0.; +#ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif IDEBUG_START; FUNCTION_PROFILE_START(); @@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ if (n <= 0) return 0.; + #ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/rotg.c b/interface/rotg.c index 69443a5a0..8d40d9c53 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -1,9 +1,11 @@ #include <math.h> +#include <float.h> #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif + #ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ @@ -14,17 +16,27
@@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #endif +#ifdef DOUBLE + long double safmin = DBL_MIN; +#else + long double safmin = FLT_MIN; +#endif + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da = *DA; long double db = *DB; long double c; long double s; - long double r, roe, z; + long double r, z; + long double sigma, dascal,dbscal; long double ada = fabsl(da); long double adb = fabsl(db); - long double scale = ada + adb; + long double maxab = MAX(ada,adb); + long double safmax; + long double scale; + #ifndef CBLAS PRINT_DEBUG_NAME; @@ -32,17 +44,25 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ PRINT_DEBUG_CNAME; #endif - roe = db; - if (ada > adb) roe = da; - - if (scale == ZERO) { + if (adb == ZERO) { *C = ONE; *S = ZERO; - *DA = ZERO; *DB = ZERO; + } else if (ada == ZERO) { + *C = ZERO; + *S = ONE; + *DA = *DB; + *DB = ONE; } else { - r = sqrt(da * da + db * db); - if (roe < 0) r = -r; + safmax = 1./safmin; + scale = MIN(MAX(safmin,maxab), safmax); + if (ada > adb) + sigma = copysign(1.,da); + else + sigma = copysign(1.,db); + dascal = da / scale; + dbscal = db / scale; + r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal)); c = da / r; s = db / r; z = ONE; @@ -65,11 +85,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ FLOAT db = *DB; FLOAT c = *C; FLOAT s = *S; - FLOAT r, roe, z; + FLOAT sigma; + FLOAT r, z; FLOAT ada = fabs(da); FLOAT adb = fabs(db); - FLOAT scale = ada + adb; + FLOAT maxab = MAX(ada,adb); + long double safmax ; + FLOAT scale ; + + safmax = 1./safmin; + scale = MIN(MAX(safmin,maxab), safmax); + + if (ada > adb) + sigma = copysign(1.,da); + else + sigma = copysign(1.,db); #ifndef CBLAS PRINT_DEBUG_NAME; @@ -77,20 +108,21 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ PRINT_DEBUG_CNAME; #endif - roe = db; - if (ada > adb) roe = da; - if (scale == ZERO) { + if (adb == ZERO) { *C = ONE; *S = ZERO; - *DA = ZERO; *DB = ZERO; + } else if (ada == ZERO) { + *C = ZERO; + *S = ONE; + *DA = *DB; + *DB = ONE; } else { FLOAT aa = da / scale; FLOAT bb = db / scale; - r = scale * sqrt(aa * aa + bb * bb); - if (roe < 0) r = -r; + r = sigma * scale * sqrt(aa * aa + bb * bb); c = da / r; s = db / r; z = ONE; diff --git a/interface/symm.c b/interface/symm.c index 3e65e69b1..3e6e0fd48 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO, int nodes; #endif # if defined(SMP) - int MN; + double MN; #endif blasint info; int side; @@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, int nodes; #endif #if defined(SMP) - int MN; + double MN; #endif PRINT_DEBUG_CNAME; diff --git a/interface/syrk.c b/interface/syrk.c index 3b056aec8..69f2328a4 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, FLOAT *sa, *sb; #ifdef SMP - int NNK; + double NNK; #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX #ifdef XDOUBLE @@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr FLOAT *sa, *sb; #ifdef SMP -int NNK; +double NNK; #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index ecda5ef4e..b0b32dc87 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + 
#ifdef NEW_IMATCOPY - if (*lda == *ldb && *cols == *rows) { + if (*lda == *ldb ) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } if ( trans == BlasConj ) { IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTrans ) + if ( trans == BlasTrans && *rows == *cols ) { IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTransConj ) + if ( trans == BlasTransConj && *rows == *cols ) { IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } + } else { @@ -153,67 +159,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } if ( trans == BlasConj ) { IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTrans ) + if ( trans == BlasTrans && *rows == *cols ) { IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTransConj ) + if ( trans == BlasTransConj && *rows == *cols ) { IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } + } - return; } #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; - - b = malloc(msize); - if ( b == NULL ) - { - printf("Memory alloc failed in zimatcopy\n"); - exit(1); - } + msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; + b = malloc(msize); + if ( b == NULL ) + { + printf("Memory alloc failed in zimatcopy\n"); + exit(1); + } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { - OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasConj ) + else if ( trans == BlasConj ) { - OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasTrans ) + else if ( trans == BlasTrans ) { - OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasTransConj ) + else if ( trans == BlasTransConj ) { - OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } } @@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { - OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RN(*rows, *cols, alpha[0], 
alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasConj ) + else if ( trans == BlasConj ) { - OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasTrans ) + else if ( trans == BlasTrans ) { - OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasTransConj ) + else if ( trans == BlasTransConj ) { - OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } } + free(b); return; diff --git a/interface/zrotg.c b/interface/zrotg.c index 123f4da85..af6f85c1c 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -1,9 +1,11 @@ #include <math.h> +#include <float.h> #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif + #ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ @@ -14,53 +16,28 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FLOAT *S = (FLOAT*) VS; #endif /* CBLAS */ -#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) - - long double da_r = *(DA + 0); - long double da_i = *(DA + 1); - long double db_r = *(DB + 0); - long double db_i = *(DB + 1); - long double r; - - long double ada = fabsl(da_r) + fabsl(da_i); - - PRINT_DEBUG_NAME; - - IDEBUG_START; - - FUNCTION_PROFILE_START(); - - if (ada == ZERO) { - *C = ZERO; - *(S + 0) = ONE; - *(S + 1) = ZERO; - *(DA + 0) = db_r; - *(DA + 1) = db_i; - } else { - long double alpha_r, alpha_i; - - ada = sqrt(da_r * da_r + da_i * da_i); - - r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i); +#ifdef DOUBLE + long double safmin = DBL_MIN; + long double rtmin = sqrt(DBL_MIN/DBL_EPSILON); +#else + long double safmin = FLT_MIN; + long double rtmin = sqrt(FLT_MIN/FLT_EPSILON); +#endif - alpha_r = da_r / ada; - alpha_i = da_i / ada; - *(C + 0) = ada / r; - *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; - *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; - *(DA + 0) = alpha_r * r; - *(DA + 1) = alpha_i * r; - } -#else - FLOAT da_r = *(DA + 0); - FLOAT da_i = *(DA + 1); - FLOAT db_r = *(DB + 0); - FLOAT db_i = *(DB + 1); - FLOAT r; + FLOAT da_r = *(DA+0); + FLOAT da_i = *(DA+1); + FLOAT db_r = *(DB+0); + FLOAT db_i = *(DB+1); + //long double r; + FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT)); + FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT)); + long double d; - FLOAT ada = fabs(da_r) + fabs(da_i); - FLOAT adb; + FLOAT ada = da_r * da_r + da_i * da_i; + FLOAT adb = db_r * db_r + db_i * db_i; + FLOAT adart = sqrt( da_r * da_r + da_i * da_i); + FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i); PRINT_DEBUG_NAME; @@ -68,69 +45,137 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FUNCTION_PROFILE_START(); - if (ada == ZERO) { - *C = ZERO; - *(S + 0) = ONE; + if (db_r == ZERO
&& db_i == ZERO) { *C = ONE; *(S + 0) = ZERO; *(S + 1) = ZERO; - *(DA + 0) = db_r; - *(DA + 1) = db_i; - } else { - FLOAT scale; - FLOAT aa_r, aa_i, bb_r, bb_i; - FLOAT alpha_r, alpha_i; - - aa_r = fabs(da_r); - aa_i = fabs(da_i); - - if (aa_i > aa_r) { - aa_r = fabs(da_i); - aa_i = fabs(da_r); - } - - if (aa_r == ZERO) { - ada = 0.; - } else { - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); - } - - bb_r = fabs(db_r); - bb_i = fabs(db_i); - - if (bb_i > bb_r) { - bb_r = fabs(bb_i); - bb_i = fabs(bb_r); - } - - if (bb_r == ZERO) { - adb = 0.; - } else { - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - } - scale = ada + adb; - - aa_r = da_r / scale; - aa_i = da_i / scale; - bb_r = db_r / scale; - bb_i = db_i / scale; - - r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); - - alpha_r = da_r / ada; - alpha_i = da_i / ada; - - *(C + 0) = ada / r; - *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; - *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; - *(DA + 0) = alpha_r * r; - *(DA + 1) = alpha_i * r; + return; } -#endif - FUNCTION_PROFILE_END(4, 4, 4); - - IDEBUG_END; - - return; + long double safmax = 1./safmin; +#if defined DOUBLE + long double rtmax = safmax /DBL_EPSILON; +#else + long double rtmax = safmax /FLT_EPSILON; +#endif + *(S1 + 0) = *(DB + 0); + *(S1 + 1) = *(DB + 1) *-1; + if (da_r == ZERO && da_i == ZERO) { + *C = ZERO; + if (db_r == ZERO) { + (*DA) = fabsl(db_i); + *S = *S1 / *DA; + *(S+1) = *(S1+1) / *DA; + return; + } else if ( db_i == ZERO) { + *DA = fabsl(db_r); + *S = *S1 / *DA; + *(S+1) = *(S1+1) / *DA; + return; + } else { + long double g1 = MAX( fabsl(db_r), fabsl(db_i)); + rtmax =sqrt(safmax/2.); + if (g1 > rtmin && g1 < rtmax) { // unscaled + d = sqrt(adb); + *S = *S1 /d; + *(S+1) = *(S1+1) /d; + *DA = d ; + *(DA+1) = ZERO; + return; + } else { // scaled algorithm + long double u = MIN ( safmax, MAX ( safmin, g1)); + FLOAT gs_r = db_r/u; + FLOAT gs_i = db_i/u; + d = sqrt ( gs_r*gs_r + gs_i*gs_i); + *S = gs_r / d; + *(S + 1) = (gs_i * -1) / d; + *DA = d * u; + *(DA+1) = ZERO; + return; + } + } + } else { + FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i)); + FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i)); + rtmax = sqrt(safmax / 4.); + if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled + long double h = ada + adb; + double adahsq = sqrt(ada * h); + if (ada >= h *safmin) { + *C = sqrt(ada/h); + *R = *DA / *C; + *(R+1) = *(DA+1) / *C; + rtmax *= 2.; + if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow + *S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq); + *(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq); + } else { + *S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h); + *(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h); + } + } else { + *C = ada / adahsq; + if (*C >= safmin) + *R = *DA / *C; + else + *R = *DA * (h / adahsq); + *S = *S1 * ada / adahsq; + *(S+1) = *(S1+1) * ada / adahsq; + } + *DA=*R; + *(DA+1)=*(R+1); + return; + } else { // scaled + FLOAT fs_r, fs_i, gs_r, gs_i; + long double v,w,f2,g2,h; + long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1))); + gs_r = db_r/u; + gs_i = db_i/u; + g2 = sqrt ( gs_r*gs_r + gs_i*gs_i); + if (f1 /u < rtmin) { + v = MIN (safmax, MAX (safmin, f1)); + w = v / u; + fs_r = *DA/ v; + fs_i = *(DA+1) / v; + f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); + h = f2 * w * w + g2; + } else { // use same scaling for both + w = 1.; + fs_r = *DA/ u; + fs_i = *(DA+1) / u; + f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); + h = f2 + g2; + } + if ( f2 >= h * safmin) {
*C = sqrt ( f2 / h ); + *DA = fs_r / *C; + *(DA+1) = fs_i / *C; + rtmax *= 2; + if ( f2 > rtmin && h < rtmax) { + *S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h)); + *(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h)); + } else { + *S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h); + *(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h); + } + } else { // intermediates might overflow + d = sqrt ( f2 * h); + *C = f2 /d; + if (*C >= safmin) { + *DA = fs_r / *C; + *(DA+1) = fs_i / *C; + } else { + *DA = fs_r * (h / d); + *(DA+1) = fs_i / (h / d); + } + *S = gs_r * (fs_r /d) - gs_i * (fs_i / d); + *(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d); + } + *C *= w; + *DA *= u; + *(DA+1) *= u; + return; + } + } } + \ No newline at end of file diff --git a/kernel/Makefile b/kernel/Makefile index 977886044..1e0a0074f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -33,7 +33,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) override CFLAGS += -march=sapphirerapids else override CFLAGS += -march=skylake-avx512 -mavx512f @@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) endif else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9))) override CFLAGS += -march=cooperlake else override CFLAGS += -march=skylake-avx512 -mavx512f @@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) +else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) + ifeq ($(C_COMPILER), PGI) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics + else + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + endif else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index bea6cb048..174a1d41b 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -35,6 +35,12 @@ USE_TRMM = 1 endif endif +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), MIPS64_GENERIC) +USE_TRMM = 1 +endif +endif + ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/arm/nrm2.c +++ b/kernel/arm/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c index fc1c8b54a..28bb0eda5 100644 --- a/kernel/arm/znrm2.c +++ b/kernel/arm/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index bd25f7cd8..ccbce27e1 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S CAXPYKERNEL = zaxpy.S 
ZAXPYKERNEL = zaxpy.S @@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S @@ -128,10 +118,10 @@ SGEMM_BETA = sgemm_beta.S SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S +SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) @@ -149,8 +139,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c +DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S @@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c @@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +ZGEMMITCOPY = 
gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 9a5938459..bc5999097 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,189 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c 
-endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(SGEMM_UNROLL_N), 16) -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -else -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -endif -ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -else -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -endif -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index 24e08a646..f100adc7a 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S index 29a68ff22..2c63925be 100644 --- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
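The register renames in these cgemm kernels (alphaI moving from w18 to w19 here and in the thunderx2t99 variant, and temp/tempOffset/tempK shifting up to x20–x22 in the ctrmm kernel further down) avoid x18, the AAPCS64 platform register: the Apple and Windows ARM64 ABIs reserve it, so clobbering it can corrupt runtime state. x19 and above are ordinary callee-saved registers that the kernel prologue already saves. A small C sketch (AArch64-only, illustrative) of using a callee-saved register safely by declaring the clobber:

```c
/* The "x19" clobber tells the compiler to save/restore it around the asm;
 * x18 should simply never be touched in portable AArch64 code. */
static inline unsigned long increment_via_x19(unsigned long v) {
    unsigned long out;
    __asm__ volatile("mov x19, %1\n\t"
                     "add x19, x19, #1\n\t"
                     "mov %0, x19"
                     : "=r"(out) : "r"(v) : "x19");
    return out;
}
```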
#define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S index 38770f66b..2136ebbee 100644 --- a/kernel/arm64/cgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rw z15.s, p0/z, [pB, 28] add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.s, p1/m, z2.s, z15.s ld1rw z15.s, p0/z, [pB, 28] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 32 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z3.s, z15.s OP_ri z23.s, p1/m, z2.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z1.s, z15.s OP_ri z23.s, p1/m, z0.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld2w {z28.s, z29.s}, p1/z, [pCRow2] fmla z28.s, p1/m, z20.s, alphaz_R fmls z28.s, p1/m, z21.s, alphaz_I @@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.s, p1/m, z23.s, alphaz_R st2w {z30.s, z31.s}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
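The hunks in cgemm_kernel_sve_v1x4.S strip the explicit PLDL1KEEP/PLDL2KEEP software prefetches, leaving the kernel's regular streaming accesses to the hardware prefetcher. For readers who don't write AArch64 assembly: a prfm hint is the assembly counterpart of the C builtin below, so the removed lines correspond to dropping such hints from a loop (illustrative sketch only):

```c
/* Software-prefetched streaming loop; the patch deletes the assembly
 * equivalent (prfm PLDL1KEEP/PLDL2KEEP) from the SVE complex kernels. */
void daxpy_hinted(long n, double a, const double *x, double *y) {
    for (long i = 0; i < n; i++) {
        __builtin_prefetch(&x[i + 64], 0, 3);  /* read, keep in L1; non-faulting */
        y[i] += a * x[i];
    }
}
```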
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, s0 dup alphaz_R, alphaR fmov alphaI, s1 @@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lcgemm_kernel_L4_Mv1_46 .Lcgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lcgemm_kernel_L4_Mv1_END: diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c index 6aa44a8f6..2fdaf5fcd 100644 --- a/kernel/arm64/cgemm_ncopy_sve_v1.c +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c index 748cd954e..086a2fed1 100644 --- a/kernel/arm64/cgemm_tcopy_sve_v1.c +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index 5c0827397..e8f1d8cf3 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
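The (uint64_t) casts added to the svwhilelt_b32 calls in cgemm_ncopy_sve_v1.c and cgemm_tcopy_sve_v1.c above (and to SVE_WHILELT in dot_kernel_sve.c further down) pin the overloaded ACLE intrinsic to its 64-bit unsigned form: BLASLONG is 64 bits wide, and letting the arguments convert to 32-bit int would truncate large problem sizes. A minimal sketch of the predicated-loop idiom with the explicit casts (assumes an SVE-enabled toolchain, e.g. -march=armv8-a+sve):

```c
#include <arm_sve.h>

/* Scale a float vector in place, one whole-vector predicate per pass. */
void sscal_sve(long n, float alpha, float *x) {
    for (long i = 0; i < n; i += svcntw()) {
        /* uint64_t operands select svwhilelt_b32_u64: no truncation of i or n */
        svbool_t pg = svwhilelt_b32((uint64_t)i, (uint64_t)n);
        svfloat32_t v = svld1(pg, &x[i]);
        svst1(pg, &x[i], svmul_n_f32_x(pg, v, alpha));
    }
}
```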
#define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 -#define temp x19 -#define tempOffset x20 -#define tempK x21 +#define alphaI w19 +#define temp x20 +#define tempOffset x21 +#define tempK x22 #define alpha0_R s10 #define alphaV0_R v10.s[0] diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c deleted file mode 100644 index 1f812c775..000000000 --- a/kernel/arm64/dgemm_ncopy_sve_v1.c +++ /dev/null @@ -1,79 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c
deleted file mode 100644
index 1f812c775..000000000
--- a/kernel/arm64/dgemm_ncopy_sve_v1.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin. */
-/* All rights reserved. */
-/* */
-/* Redistribution and use in source and binary forms, with or */
-/* without modification, are permitted provided that the following */
-/* conditions are met: */
-/* */
-/* 1. Redistributions of source code must retain the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer. */
-/* */
-/* 2. Redistributions in binary form must reproduce the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer in the documentation and/or other materials */
-/* provided with the distribution. */
-/* */
-/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
-/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
-/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
-/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
-/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
-/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
-/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
-/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
-/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
-/* POSSIBILITY OF SUCH DAMAGE. */
-/* */
-/* The views and conclusions contained in the software and */
-/* documentation are those of the authors and should not be */
-/* interpreted as representing official policies, either expressed */
-/* or implied, of The University of Texas at Austin. */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-  BLASLONG j;
-  IFLOAT *aoffset, *aoffset1, *boffset;
-
-  svint64_t lda_vec = svindex_s64(0LL, lda);
-  uint64_t sve_size = svcntd();
-
-  aoffset = a;
-  boffset = b;
-
-  j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
-  uint64_t active = svcntp_b64(svptrue_b64(), pg);
-  do {
-
-    aoffset1 = aoffset;
-
-    uint64_t i_cnt = m;
-    while (i_cnt--) {
-      svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
-      svst1_f64(pg, (double *) boffset, a_vec);
-      aoffset1++;
-      boffset += active;
-    }
-    aoffset += sve_size * lda;
-
-    j += svcntd();
-    pg = svwhilelt_b64(j, n);
-    active = svcntp_b64(svptrue_b64(), pg);
-
-
-  } while (svptest_any(svptrue_b64(), pg));
-
-  return 0;
-}
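The deleted ncopy kernel above packs one SVE vector's worth of columns at a time: the index vector holds multiples of lda, so a single predicated gather load touches one element from each active column. A standalone sketch of that loop shape, assuming an SVE-enabled toolchain (the function name is illustrative, not the replacement kernel):

    #include <arm_sve.h>

    void ncopy_v1_sketch(long m, long n, const double *a, long lda, double *b) {
        long j = 0;
        svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
        svint64_t lda_vec = svindex_s64(0, lda);      /* 0, lda, 2*lda, ... */
        do {
            uint64_t active = svcntp_b64(svptrue_b64(), pg);
            const double *col = a;
            for (long i = 0; i < m; i++) {
                /* one element from each of `active` columns, strided by lda */
                svfloat64_t v = svld1_gather_index(pg, col, lda_vec);
                svst1(pg, b, v);
                col += 1;
                b += active;
            }
            a += svcntd() * lda;
            j += svcntd();
            pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
        } while (svptest_any(svptrue_b64(), pg));
    }

The new v1x8 kernel added later in this diff keeps the same gather but unrolls the row loop.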
diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c
deleted file mode 100644
index cb645a1b6..000000000
--- a/kernel/arm64/dgemm_tcopy_sve_v1.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin. */
-/* All rights reserved. */
-/* */
-/* Redistribution and use in source and binary forms, with or */
-/* without modification, are permitted provided that the following */
-/* conditions are met: */
-/* */
-/* 1. Redistributions of source code must retain the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer. */
-/* */
-/* 2. Redistributions in binary form must reproduce the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer in the documentation and/or other materials */
-/* provided with the distribution. */
-/* */
-/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
-/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
-/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
-/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
-/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
-/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
-/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
-/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
-/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
-/* POSSIBILITY OF SUCH DAMAGE. */
-/* */
-/* The views and conclusions contained in the software and */
-/* documentation are those of the authors and should not be */
-/* interpreted as representing official policies, either expressed */
-/* or implied, of The University of Texas at Austin. */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-  BLASLONG j;
-  IFLOAT *aoffset, *aoffset1, *boffset;
-
-  uint64_t sve_size = svcntd();
-
-  aoffset = a;
-  boffset = b;
-
-  j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
-  uint64_t active = svcntp_b64(svptrue_b64(), pg);
-  do {
-
-    aoffset1 = aoffset;
-
-    uint64_t i_cnt = m;
-    while (i_cnt--) {
-      svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
-      svst1_f64(pg, (double *) boffset, a_vec);
-      aoffset1 += lda;
-      boffset += active;
-    }
-    aoffset += sve_size;
-
-    j += svcntd();
-    pg = svwhilelt_b64(j, n);
-    active = svcntp_b64(svptrue_b64(), pg);
-
-  } while (svptest_any(svptrue_b64(), pg));
-
-  return 0;
-}
diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c
index 8460e0d5e..9c057551e 100644
--- a/kernel/arm64/dot_kernel_sve.c
+++ b/kernel/arm64/dot_kernel_sve.c
@@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
   BLASLONG sve_width = SVE_WIDTH;
 
   for (BLASLONG i = 0; i < n; i += sve_width * 2) {
-    svbool_t pg_a = SVE_WHILELT(i, n);
-    svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
+    svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
+    svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
     SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
     SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c
index e342b0b63..6077c85dd 100644
--- a/kernel/arm64/dznrm2_thunderx2t99.c
+++ b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
-
+#include <float.h>
 #include <arm_neon.h>
 
 #if defined(SMP)
@@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 #else
 	nrm2_compute(n, x, inc_x, &ssq, &scale);
 #endif
-	if (fabs(scale) <1.e-300) return 0.;
+	volatile FLOAT sca = fabs(scale);
+	if (sca < DBL_MIN) return 0.;
 	ssq = sqrt(ssq) * scale;
 
 	return ssq;
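The dznrm2 hunk above swaps a hard-coded 1.e-300 cutoff for DBL_MIN, the smallest normal double (hence the new <float.h> include), and routes fabs(scale) through a volatile temporary, presumably so the early-out comparison survives aggressive optimisation. The guard in isolation, as a sketch:

    #include <float.h>
    #include <math.h>

    static double finish_nrm2(double ssq, double scale) {
        /* a scale below the smallest normal double: report a zero norm */
        volatile double sca = fabs(scale);
        if (sca < DBL_MIN) return 0.0;
        return sqrt(ssq) * scale;
    }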
diff --git a/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..90f867b44
--- /dev/null
+++ b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_INDEX svuint64_t
+#define SV_INDEXER svindex_u64
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_INDEX svuint32_t
+#define SV_INDEXER svindex_u32
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
+  a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \
+  a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \
+  svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \
+  a_offset_inner += 2; \
+  b_offset += active * 2;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+  uint64_t sve_size;
+  asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
+
+  IFLOAT *a_offset, *a_offset_inner, *b_offset;
+  a_offset = a;
+  b_offset = b;
+
+  SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
+  SV_TYPE a_vec_real;
+  SV_TYPE a_vec_imag;
+  svbool_t pg_true = SV_TRUE();
+
+  BLASLONG single_vectors_n = n & -sve_size;
+  for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+    a_offset_inner = a_offset;
+
+    svbool_t pg = pg_true;
+    uint64_t active = sve_size;
+    uint64_t i_cnt = m >> 2;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    a_offset += sve_size * lda * 2;
+  }
+
+  BLASLONG remaining_n = n - single_vectors_n;
+  if (remaining_n) {
+    a_offset_inner = a_offset;
+    svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+    uint64_t active = remaining_n;
+    uint64_t i_cnt = m >> 2;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+  }
+
+  return 0;
+}
+
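The new copy kernels read the SVE vector length with a raw cntd/cntw instruction through inline asm rather than calling svcntd()/svcntw(); the two should agree, as this small check illustrates (assumes an SVE-enabled toolchain):

    #include <arm_sve.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint64_t lanes;
        asm("cntd %[n]" : [n] "=r"(lanes));   /* 64-bit lanes per vector */
        printf("cntd=%llu svcntd=%llu\n",
               (unsigned long long)lanes, (unsigned long long)svcntd());
        return 0;
    }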
diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c
new file mode 100644
index 000000000..7b2a2e767
--- /dev/null
+++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_INDEX svuint64_t
+#define SV_INDEXER svindex_u64
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#define SV_PREFETCH svprfd_gather_index
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_INDEX svuint32_t
+#define SV_INDEXER svindex_u32
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#define SV_PREFETCH svprfw_gather_index
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
+  a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \
+  svst1(pg, b_offset, a_vec); \
+  a_offset_inner++; \
+  b_offset += active;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+  uint64_t sve_size;
+  asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
+
+  IFLOAT *a_offset, *a_offset_inner, *b_offset;
+  a_offset = a;
+  b_offset = b;
+
+  SV_INDEX lda_vec = SV_INDEXER(0LL, lda);
+  SV_TYPE a_vec;
+  svbool_t pg_true = SV_TRUE();
+
+  BLASLONG single_vectors_n = n & -sve_size;
+  for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+    a_offset_inner = a_offset;
+
+    svbool_t pg = pg_true;
+    uint64_t active = sve_size;
+    uint64_t i_cnt = m >> 3;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 4) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    a_offset += sve_size * lda;
+  }
+
+  BLASLONG remaining_n = n - single_vectors_n;
+  if (remaining_n) {
+    a_offset_inner = a_offset;
+    svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+    uint64_t active = remaining_n;
+    uint64_t i_cnt = m >> 2;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+  }
+
+  return 0;
+}
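All four new kernels consume the row count with the same shape: eight macro expansions per full block in the main path (four in the complex variants), then tails peeled off the low bits of m. Stripped of the SVE details, the control flow is simply:

    /* step() is a hypothetical stand-in for one INNER_COPY expansion */
    static void copy_rows(long m, void (*step)(void)) {
        long blocks = m >> 3;              /* full blocks of 8 rows */
        while (blocks--) {
            step(); step(); step(); step();
            step(); step(); step(); step();
        }
        if (m & 4) { step(); step(); step(); step(); }
        if (m & 2) { step(); step(); }
        if (m & 1) { step(); }
    }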
diff --git a/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..975166a2e
--- /dev/null
+++ b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64x2_t
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32x2_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
+  a_vec = svld2(pg, a_offset_inner); \
+  svst2(pg, b_offset, a_vec); \
+  a_offset_inner += lda * 2; \
+  b_offset += active * 2;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+  uint64_t sve_size = svcntw();
+  asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
+
+  IFLOAT *a_offset, *a_offset_inner, *b_offset;
+  a_offset = a;
+  b_offset = b;
+
+  SV_TYPE a_vec;
+  svbool_t pg_true = SV_TRUE();
+
+  BLASLONG single_vectors_n = n & -sve_size;
+  for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+    a_offset_inner = a_offset;
+
+    svbool_t pg = pg_true;
+    uint64_t active = sve_size;
+    uint64_t i_cnt = m >> 2;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    a_offset += sve_size * 2;
+  }
+
+  BLASLONG remaining_n = n - single_vectors_n;
+  if (remaining_n) {
+    a_offset_inner = a_offset;
+    svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+    uint64_t active = remaining_n;
+    uint64_t i_cnt = m >> 2;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+  }
+
+  return 0;
+}
+
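The complex tcopy kernel above leans on SVE structure loads: svld2 de-interleaves (real, imag) pairs into a two-vector tuple and svst2 re-interleaves them on store, so each predicate lane moves one complex element. In isolation (sketch, single precision, illustrative function name):

    #include <arm_sve.h>

    void copy_complex_row(svbool_t pg, const float *src, float *dst) {
        svfloat32x2_t ri = svld2(pg, src);  /* v0 = reals, v1 = imags */
        svst2(pg, dst, ri);                 /* stored back interleaved */
    }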
diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c
new file mode 100644
index 000000000..9a93b6cb7
--- /dev/null
+++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c
@@ -0,0 +1,125 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
+  a_vec = svld1(pg, a_offset_inner); \
+  svst1(pg, b_offset, a_vec); \
+  a_offset_inner += lda; \
+  b_offset += active;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+  uint64_t sve_size = svcntw();
+  asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
+
+  IFLOAT *a_offset, *a_offset_inner, *b_offset;
+  a_offset = a;
+  b_offset = b;
+
+  SV_TYPE a_vec;
+  svbool_t pg_true = SV_TRUE();
+
+  BLASLONG single_vectors_n = n & -sve_size;
+  for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+    a_offset_inner = a_offset;
+
+    svbool_t pg = pg_true;
+    uint64_t active = sve_size;
+    uint64_t i_cnt = m >> 3;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 4) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    a_offset += sve_size;
+  }
+
+  BLASLONG remaining_n = n - single_vectors_n;
+  if (remaining_n) {
+    a_offset_inner = a_offset;
+    svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+    uint64_t active = remaining_n;
+    uint64_t i_cnt = m >> 2;
+    while (i_cnt--) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 2) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+
+    if (m & 1) {
+      INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+    }
+  }
+
+  return 0;
+}
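The SV_WHILE((uint64_t)0L, (uint64_t)remaining_n) call above, and the many svwhilelt_bXX call sites rewritten throughout this diff with explicit casts, pin down which ACLE overload is used: svwhilelt resolves on its scalar argument types, and a BLASLONG whose width or signedness differs between builds (ILP64 vs LP64) could otherwise select a different variant or fail to match. The cast form in isolation (hypothetical helper name):

    #include <arm_sve.h>
    #include <stdint.h>

    /* predicate over 64-bit lanes j .. n-1 */
    static svbool_t first_lanes(int64_t j, int64_t n) {
        return svwhilelt_b64((uint64_t)j, (uint64_t)n);  /* svwhilelt_b64_u64 */
    }

Note that the trmm/trsm files below cast the b32 calls to uint64_t as well; that selects the 64-bit-scalar overload svwhilelt_b32_u64, which is equally valid.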
diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S
old mode 100755
new mode 100644
diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c
deleted file mode 100644
index 1bc186335..000000000
--- a/kernel/arm64/sgemm_ncopy_sve_v1.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin. */
-/* All rights reserved. */
-/* */
-/* Redistribution and use in source and binary forms, with or */
-/* without modification, are permitted provided that the following */
-/* conditions are met: */
-/* */
-/* 1. Redistributions of source code must retain the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer. */
-/* */
-/* 2. Redistributions in binary form must reproduce the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer in the documentation and/or other materials */
-/* provided with the distribution. */
-/* */
-/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
-/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
-/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
-/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
-/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
-/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
-/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
-/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
-/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
-/* POSSIBILITY OF SUCH DAMAGE. */
-/* */
-/* The views and conclusions contained in the software and */
-/* documentation are those of the authors and should not be */
-/* interpreted as representing official policies, either expressed */
-/* or implied, of The University of Texas at Austin. */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-  BLASLONG j;
-  IFLOAT *aoffset, *aoffset1, *boffset;
-
-  svint32_t lda_vec = svindex_s32(0LL, lda);
-  uint32_t sve_size = svcntw();
-
-  aoffset = a;
-  boffset = b;
-
-  j = 0;
-  svbool_t pg = svwhilelt_b32(j, n);
-  uint32_t active = svcntp_b32(svptrue_b32(), pg);
-  do {
-
-    aoffset1 = aoffset;
-
-    uint32_t i_cnt = m;
-    while (i_cnt--) {
-      svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
-      svst1_f32(pg, (float *) boffset, a_vec);
-      aoffset1++;
-      boffset += active;
-    }
-    aoffset += sve_size * lda;
-
-    j += svcntw();
-    pg = svwhilelt_b32(j, n);
-    active = svcntp_b32(svptrue_b32(), pg);
-
-  } while (svptest_any(svptrue_b32(), pg));
-
-  return 0;
-}
diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c
deleted file mode 100644
index 9f8cf502a..000000000
--- a/kernel/arm64/sgemm_tcopy_sve_v1.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin. */
-/* All rights reserved. */
-/* */
-/* Redistribution and use in source and binary forms, with or */
-/* without modification, are permitted provided that the following */
-/* conditions are met: */
-/* */
-/* 1. Redistributions of source code must retain the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer. */
-/* */
-/* 2. Redistributions in binary form must reproduce the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer in the documentation and/or other materials */
-/* provided with the distribution. */
-/* */
-/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
-/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
-/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
-/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
-/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
-/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
-/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
-/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
-/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
-/* POSSIBILITY OF SUCH DAMAGE. */
-/* */
-/* The views and conclusions contained in the software and */
-/* documentation are those of the authors and should not be */
-/* interpreted as representing official policies, either expressed */
-/* or implied, of The University of Texas at Austin. */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-  BLASLONG j;
-  IFLOAT *aoffset, *aoffset1, *boffset;
-
-  uint32_t sve_size = svcntw();
-
-  aoffset = a;
-  boffset = b;
-
-  j = 0;
-  svbool_t pg = svwhilelt_b32(j, n);
-  uint32_t active = svcntp_b32(svptrue_b32(), pg);
-  do {
-
-    aoffset1 = aoffset;
-
-    uint32_t i_cnt = m;
-    while (i_cnt--) {
-      svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
-      svst1_f32(pg, (float *) boffset, a_vec);
-      aoffset1 += lda;
-      boffset += active;
-    }
-    aoffset += sve_size;
-
-    j += svcntw();
-    pg = svwhilelt_b32(j, n);
-    active = svcntp_b32(svptrue_b32(), pg);
-
-  } while (svptest_any(svptrue_b32(), pg));
-
-  return 0;
-}
diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c
index 6ba4afc8b..e138f0647 100644
--- a/kernel/arm64/symm_lcopy_sve.c
+++ b/kernel/arm64/symm_lcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin. */
+/* Copyright 2023 The OpenBLAS Project */
 /* All rights reserved.
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c index 32da5bd16..9a4cb6d4f 100644 --- a/kernel/arm64/symm_ucopy_sve.c +++ b/kernel/arm64/symm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index 918e945ac..c7f79e3fd 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index b76cc56de..b3ba68973 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while 
(svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 75fa163ae..a47d2096c 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 36a03242a..c5188beb4 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index 5a9d4194a..2895eb85d 100644 --- a/kernel/arm64/trsm_lncopy_sve.c +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn 
= svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index ac4019e26..fdda992e0 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 8fdcd0f4b..1a03aa974 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 0f5f0dccd..b06166f36 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 728f97fb3..6f65e5cfd 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#if (NVCOMPVERS < 2309) +#pragma opt 1 +#endif +#endif #include "common.h" diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index d5b35775c..a043948d6 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.d, p1/m, z2.d, z15.d ld1rd z15.d, p0/z, [pB, 56] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 64 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z3.d, z14.d OP_ii z22.d, p1/m, z3.d, z15.d OP_ri z23.d, p1/m, z2.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z1.d, z14.d OP_ii z22.d, p1/m, z1.d, z15.d OP_ri z23.d, p1/m, z0.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2d {z28.d, z29.d}, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaz_R @@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.d, p1/m, z23.d, alphaz_R st2d {z30.d, z31.d}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, d0 dup alphaz_R, alphaR fmov alphaI, d1 @@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lzgemm_kernel_L4_Mv1_46 .Lzgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lzgemm_kernel_L4_Mv1_END: diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 8f9b4268a..6b8c93baf 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
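The zdot_thunderx2t99.c hunk a little further up adds a version-gated workaround for the NVIDIA HPC compiler: builds with __NVCOMPILER older than 23.09 drop to #pragma opt 1 for that file, apparently to sidestep a miscompilation. The added block, for reference:

    #ifdef __NVCOMPILER
    #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
    #if (NVCOMPVERS < 2309)
    #pragma opt 1   /* lower the optimisation level on affected versions */
    #endif
    #endif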
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index c6e50bc1c..fd8d2190f 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 37dbfe4e1..fcf7e7073 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
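A recurring theme in the zgemm/cgemm SVE kernel hunks above is the wholesale removal of prfm PLDL1KEEP/PLDL2KEEP software prefetches, presumably because the targets running these SVE kernels do as well or better without the extra instructions. The C-level equivalent of the hint being dropped, for context (a sketch using the GCC/Clang builtin):

    /* roughly: prfm PLDL1KEEP, [p] */
    static inline void prefetch_l1(const void *p) {
        __builtin_prefetch(p, 0 /* read */, 3 /* high temporal locality */);
    }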
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 21e03b7be..056c9824e 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); #else @@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c index 6f18aa956..5a17d3b19 100644 --- a/kernel/arm64/zsymm_lcopy_sve.c +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c index 6be48cdaf..06989e3aa 100644 --- a/kernel/arm64/zsymm_ucopy_sve.c +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index d34f607ab..5a7171d9d 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index 7f34c9857..3a88f26b2 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index 7eb9452c9..c3dbdcbe3 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index 60c8ff3b4..ddfa7ba4e 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
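One wrinkle worth flagging in the ztrmm_*copy_sve_v1.c hunks above: their non-DOUBLE branches cast the svwhilelt_b32 operands to uint64_t, whereas the ztrsm_*copy_sve.c files below cast to uint32_t. Both are legal ACLE overloads (svwhilelt_b32_u64 exists alongside svwhilelt_b32_u32), and they build the same predicate for the non-negative, well-within-range bounds used here, so this is a consistency nit rather than a bug:

    #include <arm_sve.h>
    #include <stdint.h>

    /* Equivalent results for 0 <= js <= n < 2^31, which holds here. */
    static svbool_t tail_u32(int64_t js, int64_t n)
    { return svwhilelt_b32((uint32_t)js, (uint32_t)n); } /* ztrsm_* style */

    static svbool_t tail_u64(int64_t js, int64_t n)
    { return svwhilelt_b32((uint64_t)js, (uint64_t)n); } /* ztrmm_* style */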
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c index eb7cd0294..f81ba14c2 100644 --- a/kernel/arm64/ztrsm_lncopy_sve.c +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 34dbf8a30..46a11abed 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
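For readers new to these kernels: the svindex_s64(0LL, lda) / svindex_s32(0, lda) vectors built above are gather indices, letting one vector register touch one element from each of several lda-spaced columns. A plausible sketch of the idea (the load intrinsics themselves are outside this diff, so treat the wrapper and its name as illustrative only):

    #include <arm_sve.h>
    #include <stdint.h>

    /* Read one element from each of several columns of a column-major
     * matrix: a[0], a[lda], a[2*lda], ... for the active lanes of pg. */
    static svfloat64_t gather_across_columns(const double *a, int64_t lda,
                                             svbool_t pg)
    {
        svint64_t index = svindex_s64(0, lda); /* 0, lda, 2*lda, ... */
        return svld1_gather_s64index_f64(pg, a, index);
    }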
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c index 92e086b75..436112130 100644 --- a/kernel/arm64/ztrsm_uncopy_sve.c +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index ccb942e1b..ddf3e265f 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c old mode 100755 new mode 100644 diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 08f85e891..5833a64ef 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02; + FLOAT data01=0.0, data02=0.0; FLOAT *a1; lda *= 2; diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index 387bb2532..bc495f7c6 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; + data01=data02=data07=data08=0.0; lda *= 2; jj = offset; diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cda359040..67d1fd11c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,3 +1,4 @@ +ifndef NO_LASX DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S @@ -8,7 +9,26 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMVNKERNEL = dgemv_n_8_lasx.S +DGEMVTKERNEL = dgemv_t_8_lasx.S + +SGEMMKERNEL = sgemm_kernel_16x8_lasx.S +SGEMMINCOPY = sgemm_ncopy_16_lasx.S +SGEMMITCOPY = sgemm_tcopy_16_lasx.S +SGEMMONCOPY = sgemm_ncopy_8_lasx.S +SGEMMOTCOPY = sgemm_tcopy_8_lasx.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index b772a6f82..213add9ee 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = 
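The kernel/generic/ztrsm_utcopy_1.c and ztrsm_utcopy_2.c hunks above give data01/data02 (and data07/data08) explicit zero defaults, presumably to quiet -Wmaybe-uninitialized on newer compilers and to make the skipped-assignment paths deterministic: the scalars are assigned under conditions the compiler cannot prove always hold, but are stored unconditionally. A stripped-down model of the pattern (hypothetical function, not the real kernel):

    /* Guarded load, unconditional store: without the '= 0.0' the
     * compiler cannot prove data01/data02 are written before use. */
    static void copy_row(long ii, long jj, const double *a, double *b)
    {
        double data01 = 0.0, data02 = 0.0; /* the fix */
        if (ii > jj) {
            data01 = a[0];
            data02 = a[1];
        }
        b[0] = data01;
        b[1] = data02;
    }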
../arm/gemv_n.c +ifndef DGEMVNKERNEL DGEMVNKERNEL = ../arm/gemv_n.c +endif CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c +ifndef DGEMVTKERNEL DGEMVTKERNEL = ../arm/gemv_t.c +endif CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S index 9d27987e1..41667485a 100644 --- a/kernel/loongarch64/cnrm2.S +++ b/kernel/loongarch64/cnrm2.S @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 srai.d I, N, 2 bge $r0, I, .L25 LD a1, X, 0 * SIZE diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S index 13faa977e..f8e26fda2 100644 --- a/kernel/loongarch64/dgemm_kernel_16x4.S +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +/********************************************************************* +* 2023/06/28 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/06/28 guxiwei +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 32 +* DGEMM_DEFAULT_Q 152 +* DGEMM_DEFAULT_R 858 +* A_PR1 1024 +* B_PR1 256 +* +* +* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 36.0 GFLOPS +* 2 threads: 71.6 GFLOPS +* 3 threads: 101.5 GFLOPS +* 4 threads: 132.8 GFLOPS +*********************************************************************/ + /* Function parameters */ #define M $r4 // param 1: bm #define N $r5 // param 2: bn @@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U4 $xr4 #define U5 $xr5 #define U6 $xr6 -#define D0 $xr7 -#define D1 $xr8 -#define D2 $xr9 -#define D3 $xr10 -#define D4 $xr11 -#define D5 $xr12 -#define D6 $xr13 -#define D7 $xr14 -#define D8 $xr15 -#define D9 $xr16 -#define D10 $xr17 -#define D11 $xr18 -#define D12 $xr19 -#define D13 $xr20 -#define D14 $xr21 -#define D15 $xr22 -#define VALPHA $xr23 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define VALPHA $xr15 /* Prefetch interval */ -#define A_PRE 0x200 +#define A_PRE 0x400 #define B_PRE 0x100 - PROLOGUE - - addi.d $sp, $sp, -56 - /* Store regs */ - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 24 - SDARG $r27, $sp, 32 - ST $f23, $sp, 40 - ST ALPHA, $sp, 48 - - /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ - xvld VALPHA, $sp, 48 - xvreplve0.d VALPHA, VALPHA - -#if defined (TRMMKERNEL) && !defined(LEFT) - sub.d OFF, ZERO, OFFSET -#else - xor OFF, OFF, OFF -#endif - - /* if (!(N >> 2)) goto L_N3 */ - srai.d J, N, 2 /* J = bn >> 2 */ - andi N, N, 0x03 - beq ZERO, J, .L_N3 - -.L_J1: /* J-- && This loop include Condition 1 */ - -/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
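Stepping back to the one-instruction kernel/loongarch64/cnrm2.S change earlier in this hunk: it is a behavioral fix, not cleanup. bge $r0, INCX branches when 0 >= INCX, so vectors with a negative stride took the early-out along with the degenerate INCX == 0 case; beq narrows the test to zero only. In C terms (comment-only sketch; INCX has already been shifted by ZBASE_SHIFT, i.e. scaled to the complex element size, at this point):

    /* before:  if (0 >= incx) goto L999;   // bge $r0, INCX, .L999 */
    /* after:   if (incx == 0) goto L999;   // beq $r0, INCX, .L999 */
    /* Negative strides now fall through to the accumulation loop.  */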
************************* -* dgemm_core_16x4 */ - move C0, C - move A0, A - slli.d T0, LDC, 3 - add.d C1, C0, T0 - addi.d J, J, -1 /* J-- */ - add.d C2, C1, T0 - add.d C3, C2, T0 - -#if defined(TRMMKERNEL) && defined(LEFT) - move OFF, OFFSET -#endif - - /* if (!(M >> 4)) goto L_M8 */ - srai.d I, M, 4 /* I = bm >> 4 */ - beq ZERO, I, .L_M8 - -.L_I1: /* I-- */ -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x07 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 16 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - /* Calculate the first set of D0~D15, - * avoidig set 0 operation - * Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ +.macro KERNEL2x16x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - preld 0, C0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - preld 0, C0, 0x40 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - preld 0, C1, 0x00 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - preld 0, C1, 0x40 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - xvldrepl.d U4, B0, 0x10 - preld 0, C2, 0x00 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 - preld 0, C2, 0x40 - xvfmul.d D10, U2, U4 - xvfmul.d D11, U3, U4 - - xvldrepl.d U4, B0, 0x18 - preld 0, C3, 0x00 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 - preld 0, C3, 0x40 - xvfmul.d D14, U2, U4 - xvfmul.d D15, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_L7 */ - beq ZERO,TL, .L_L7 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Calculate 8 sets of D0~D15 */ -.L_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 
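As a sanity check on the single-thread figure in the new header comment (36.0 GFLOPS at 2.5 GHz): assuming the 3A5000 core sustains two 256-bit double-precision FMA issues per cycle, which is an assumption on our part and not something the patch states, peak is 2 x (256/64) x 2 = 16 FLOP/cycle, i.e. 40 GFLOPS at 2.5 GHz, putting the kernel at 90% of peak. The doubled prefetch distance above (A_PRE 0x200 to 0x400) also tracks the new structure: the 16x4 microkernel consumes 0x80 bytes of A per k-iteration, so 0x400 bytes is eight iterations ahead.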
0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 +.endm - /***8-3***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x16x4_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 
+ xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm - /***8-5***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4 xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvldrepl.d U12, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 +.endm - /***8-7***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4_END xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvfmadd.d D4, U0, U5, D4 + 
xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm - /***8-8***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_TL1 + xvld U8, A0, 0x00 - /* Maybe we need calculate the last - * 7 sets of D0~D15? 
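The macro rewrite above is the heart of this patch: the eight near-identical load-then-FMA copies in the old .L_TL1 loop body become a two-deep software pipeline. KERNEL2x16x4 issues the FMAs for the operands already sitting in U8-U15 while loading the next k-iteration's A panel into U0-U3 and B broadcasts into U4-U7, then swaps roles for the second half; KERNEL2x16x4_END is the drain step that finishes the in-flight work without issuing loads past the end of the A and B panels. A scalar C model of the scheme (hypothetical names; one accumulator stands in for D0-D15):

    /* Two-stage software pipeline: loads for step i+1 overlap the FMAs
     * for step i, alternating between two register sets. */
    static double pipelined_dot(long k, const double *a, const double *b)
    {
        double acc = 0.0;
        double cur_a = a[0], cur_b = b[0];       /* warm-up load       */
        for (long i = 1; i < k; i++) {
            double next_a = a[i], next_b = b[i]; /* load ahead         */
            acc += cur_a * cur_b;                /* FMA on current set */
            cur_a = next_a;                      /* swap sets          */
            cur_b = next_b;
        }
        acc += cur_a * cur_b;                    /* _END: drain        */
        return acc;
    }

Running the pipeline needs twice the live operand registers, which is why the U/D register map at the top of this file's diff was renumbered and extended to cover xr0-xr31.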
- */ -.L_L7: - /* if (!(L & 7)) goto L_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_L0 + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 -.L_L71: - /* Load 16 * 64 from A0 */ + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 - blt ZERO,TL, .L_L71 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm -.L_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D2, D2, VALPHA - xvfmul.d D3, D3, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D6, D6, VALPHA - xvfmul.d D7, D7, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D10, D10, VALPHA - xvfmul.d D11, D11, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA - xvfmul.d D14, D14, VALPHA - xvfmul.d D15, D15, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvld U2, C0, 0x40 - xvld U3, C0, 0x60 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 - xvfmadd.d D2, D2, VALPHA, U2 - xvfmadd.d D3, D3, VALPHA, U3 +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvld U2, C2, 0x40 - xvld U3, C2, 0x60 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 - xvfmadd.d D10, D10, VALPHA, U2 - xvfmadd.d D11, D11, VALPHA, U3 +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvld U2, C3, 0x40 - xvld U3, C3, 0x60 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 - xvfmadd.d D14, D14, VALPHA, U2 - xvfmadd.d D15, D15, VALPHA, U3 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - xvst D2, C0, 0x40 - xvst D3, C0, 0x60 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - xvst D6, C1, 0x40 - xvst D7, C1, 0x60 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - xvst D10, C2, 0x40 - xvst D11, C2, 0x60 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 
0x20 - xvst D14, C3, 0x40 - xvst D15, C3, 0x60 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 - /* Add stride for C */ - addi.d C0, C0, 0x80 - addi.d C1, C1, 0x80 - addi.d C2, C2, 0x80 - addi.d C3, C3, 0x80 + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -16 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x07 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_I1 +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 -.L_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_M0 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - andi I, M, 8 - beq ZERO,I, .L_M4 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M8_L7 */ - beq ZERO,TL, .L_M8_L7 +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 -.L_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d 
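The .L_*_L0 epilogues above keep the long-standing split between kernel flavors: under TRMMKERNEL the kernel owns its C tile and only scales the accumulators (xvfmul.d D, D, VALPHA), while the plain GEMM path loads C and fuses the update (xvfmadd.d D, D, VALPHA, U). Scalar model (hypothetical helper):

    /* Per-element epilogue, matching the #if defined(TRMMKERNEL) blocks. */
    static inline double epilogue(double d, double alpha, double c)
    {
    #ifdef TRMMKERNEL
        (void)c;
        return d * alpha;       /* C  = alpha * A*B        */
    #else
        return d * alpha + c;   /* C  = alpha * A*B + C    */
    #endif
    }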
U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm - /***8-2***/ +.macro KERNEL2x16x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm - /***8-3***/ +.macro KERNEL2x8x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-4***/ +.macro KERNEL2x8x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-5***/ +.macro KERNEL2x4x2 xvld U0, A0, 0x00 - xvld 
U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-6***/ +.macro KERNEL2x4x2_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm - /***8-7***/ +.macro KERNEL2x2x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - /***8-8***/ +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M8_TL1 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 -.L_M8_L7: - /* 
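A reading aid for the macro family that continues below: in KERNELpxMxN, MxN is the register-tile shape and p counts k-iterations per invocation. KERNEL2xMxN is one pipelined step covering two iterations; KERNEL8xMxN is four repetitions of it; KERNEL8xMxN_END is three repetitions plus the draining KERNEL2xMxN_END. That matches the unroll-by-eight trip count the driver loops compute with srai.d TL, L, 3:

    /* Expansion sketch of the assembler .rept wrappers:
     *   KERNEL8xMxN     = 4 x KERNEL2xMxN              (8 k-iterations)
     *   KERNEL8xMxN_END = 3 x KERNEL2xMxN
     *                   + 1 x KERNEL2xMxN_END (drain)  (8 k-iterations) */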
if (!(L & 7)) goto L_M8_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_M8_L0 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm -.L_M8_L71: +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm - addi.d TL, TL, -1 - blt ZERO,TL, .L_M8_L71 +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 -.L_M8_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 + xvldrepl.d U4, B0, 0x00 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 -#endif // #if defined(TRMMKERNEL) + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 0x20 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - /* Add stride for C */ - addi.d C0, C0, 0x40 - addi.d C1, C1, 0x40 - addi.d C2, C2, 0x40 - addi.d C3, C3, 0x40 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -8 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x06 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm -#ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x16x1_END +.rept 3 + KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm -/********LOOP (if(N >> 2 ) && (M & 8)) End************/ +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 -.L_M4: - andi I, M, 4 - beq 
ZERO,I, .L_M2 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - add.d B0, B, T0 -#endif + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm - /* Load 4 * 64 from A0 */ +.macro KERNEL2x8x1_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M4_L7 */ - beq ZERO,TL, .L_M4_L7 +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm -.L_M4_TL1: /* TL-- */ - /***8-1***/ +.macro KERNEL2x4x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x08 +.endm - /***8-2***/ +.macro KERNEL2x4x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm - /***8-3***/ +.macro KERNEL2x2x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm - /***8-4***/ +.macro KERNEL2x2x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, 
U4, D12 +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm - /***8-5***/ +.macro KERNEL2x1x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm - /***8-6***/ +.macro KERNEL2x1x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + PROLOGUE - /***8-7***/ - xvld U0, A0, 0x00 + addi.d $sp, $sp, -120 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA, $sp, 112 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_J1: /* J-- && This loop include Condition 1 */ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
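The new prologue above explains the stack-frame growth from 56 to 120 bytes: the rewrite uses the whole xr0-xr31 LASX register file, and the low 64 bits of xr24-xr31 alias $f24-$f31, which the LoongArch64 psABI designates callee-saved (fs0-fs7), so they must now be spilled alongside the usual GPRs. ALPHA lives at sp+112 so it can be re-broadcast after the kernel macros have clobbered its register:

    /* Frame layout implied by the stores above (byte offsets):
     *   sp+0  .. sp+32  : $r23-$r27            (as before)
     *   sp+40 .. sp+104 : $f23-$f31            ($f24-$f31 newly saved)
     *   sp+112          : ALPHA spill, reloaded with xvldrepl.d        */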
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 - /***8-8***/ +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 - addi.d A0, A0, 0x20 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x16x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M4_TL1 + blt ZERO,TL, .L_TL1 -.L_M4_L7: - /* if (!(L & 7)) goto L_M4_L0 */ +.L_TL1_END: + KERNEL8x16x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? 
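The TRMM offset bookkeeping above encodes the tile geometry in its shift amounts: with 8-byte doubles, advancing A by OFF rows of the 16-wide panel is OFF << 7 bytes (16 x 8 = 128), and advancing B within the 4-wide panel is OFF << 5 bytes (4 x 8 = 32); the M=8 path below uses OFF << 6 accordingly:

    /* Byte offsets used by the TRMM paths (sizeof(double) == 8):
     *   A panel, M=16: OFF * 16 * 8 = OFF << 7
     *   A panel, M=8 : OFF *  8 * 8 = OFF << 6
     *   B panel, N=4 : OFF *  4 * 8 = OFF << 5 */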
+ */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M4_L0 + beq TL, ZERO,.L_L0 -.L_M4_L71: +.L_L71: + /* Load 16 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x20 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M4_L71 + blt ZERO,TL, .L_L71 -.L_M4_L0: +.L_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U8, C2, 0x00 + xvld U9, C2, 0x20 + xvld U10, C2, 0x40 + xvld U11, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U8 + xvfmadd.d D9, D9, VALPHA, U9 + xvfmadd.d D10, D10, VALPHA, U10 + xvfmadd.d D11, D11, VALPHA, U11 /* Load C3 */ xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) /* Store C0 */ xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 /* Store C1 */ xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 /* Store C2 */ xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 /* Store C3 */ xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 /* Add stride for C */ - addi.d C0, C0, 0x20 - addi.d C1, C1, 0x20 - addi.d C2, C2, 0x20 - addi.d C3, C3, 0x20 + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT - /* number of values in A */ - addi.d L, L, -4 + /* number of values in A */ + addi.d L, L, -16 #else /* number of values in B */ addi.d L, L, 
-4 #endif - slli.d T0, L, 0x05 + slli.d T0, L, 0x07 add.d A0, A0, T0 + slli.d T0, L, 0x05 add.d B0, B0, T0 #endif #ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x04 + addi.d OFF, OFF, 0x10 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 -.L_M2: - andi I, M, 2 - beq ZERO,I, .L_M1 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x04 + slli.d T0, OFF, 0x06 add.d A0, A0, T0 slli.d T0, OFF, 0x05 add.d B0, B, T0 @@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 2 + addi.d L, OFF, 8 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else // #if !defined(TRMMKERNEL) move B0, B move L, K /* L = bk */ -#endif +#endif // #if defined(TRMMKERNEL) - /* Load 2 * 64 from A0 */ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M2_L7 */ - beq ZERO,TL, .L_M2_L7 - -.L_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d 
B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 - addi.d A0, A0, 0x10 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + beq ZERO, TL, .L_M8_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 +.L_M8_TL1: /* TL-- */ + KERNEL8x8x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M2_TL1 + blt ZERO,TL, .L_M8_TL1 -.L_M2_L7: - /* if (!(L & 7)) goto L_M2_L0 */ +.L_M8_TL1_END: + KERNEL8x8x4_END + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M2_L0 + beq TL, ZERO,.L_M8_L0 -.L_M2_L71: +.L_M8_L71: xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M2_L71 + blt ZERO,TL, .L_M8_L71 -.L_M2_L0: +.L_M8_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U4 + xvfmadd.d D9, D9, VALPHA, U5 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U6 + xvfmadd.d D13, D13, VALPHA, U7 #endif // #if defined(TRMMKERNEL) - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 - xvstelm.d D0, C0, 0x08, 0x01 - xvstelm.d D4, C1, 0x08, 0x01 - xvstelm.d D8, C2, 0x08, 0x01 - xvstelm.d D12, C3, 0x08, 0x01 + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + 
xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 /* Add stride for C */ - addi.d C0, C0, 0x10 - addi.d C1, C1, 0x10 - addi.d C2, C2, 0x10 - addi.d C3, C3, 0x10 + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT /* number of values in A */ - addi.d L, L, -2 + addi.d L, L, -8 #else /* number of values in B */ addi.d L, L, -4 #endif - slli.d T0, L, 0x04 + slli.d T0, L, 0x06 add.d A0, A0, T0 slli.d T0, L, 0x05 add.d B0, B0, T0 @@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef LEFT /* number of values in A */ - addi.d OFF, OFF, 0x02 + addi.d OFF, OFF, 0x08 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ -.L_M1: - andi I, M, 1 - beq ZERO,I, .L_M0 +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x03 - add.d A0, A0, T0 slli.d T0, OFF, 0x05 + add.d A0, A0, T0 add.d B0, B, T0 #endif @@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 1 + addi.d L, OFF, 4 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move L, K /* L = bk */ #endif - /* Load 1 * 64 from A0 */ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M1_L7 */ - beq ZERO,TL, .L_M1_L7 + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 -.L_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + beq ZERO, TL, .L_M4_TL1_END - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 - /***8-2***/ +.L_M4_L71: xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 @@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
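/* The M & 4 tail below follows the same scheme as the 16- and 8-row
 * paths above, only with MR = 4 (one 256-bit register per k-step).
 * Illustrative C for the whole .L_Mx family, assuming packed blocks
 * A0 (MR x K) and B0 (K x 4) of doubles (a sketch, not build code):
 *
 *     for (k = 0; k < K; k++)
 *         for (j = 0; j < 4; j++)        // one xvldrepl.d of B per j
 *             for (i = 0; i < MR; i++)   // 4 doubles per 256-bit lane group
 *                 D[j][i] += A0[k*MR + i] * B0[k*4 + j];
 *
 * MR is 16/8/4/2/1 along the fall-through chain
 * .L_I1 -> .L_M8 -> .L_M4 -> .L_M2 -> .L_M1.
 */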
xvldrepl.d U4, B0, 0x18 xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x08 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - /***8-3***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M4_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Load C1 */ + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + /* Load C2 */ + xvld U2, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U2 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + /* Load C3 */ + xvld U3, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 - /***8-4***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 - addi.d A0, A0, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvld U4, B0, 0x00 + + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-5***/ - xvld U0, A0, 0x00 + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_M2_L71: + xvldrepl.d U0, A0, 
0x00 + xvldrepl.d U1, A0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U4, B0, 0x00 - addi.d A0, A0, 0x08 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-6***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M2_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 + xvstelm.d D1, C0, 0x08, 0x00 + xvstelm.d D1, C1, 0x08, 0x01 + xvstelm.d D1, C2, 0x08, 0x02 + xvstelm.d D1, C3, 0x08, 0x03 +#else + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvpermi.q U2, U0, 0x20 + xvpermi.q U3, U1, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D0, D4, VALPHA, U2 + xvfmadd.d D1, D5, VALPHA, U3 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + vst $vr16, C0, 0x00 + vst $vr17, C1, 0x00 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D1, C3, 0x00, 0x02 + xvstelm.d D0, C2, 0x08, 0x03 + xvstelm.d D1, C3, 0x08, 0x03 +#endif // #if defined(TRMMKERNEL) - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif - /***8-8***/ - xvld U0, A0, 0x00 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U8, A0, 0x00 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 + xvld U12, B0, 0x00 + addi.d 
A0, A0, 0x08 + addi.d B0, B0, 0x20 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END .L_M1_L7: /* if (!(L & 7)) goto L_M1_L0 */ @@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq TL, ZERO,.L_M1_L0 .L_M1_L71: - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_M1_L71 .L_M1_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D12, D12, VALPHA + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 #else /* Load C0 */ - xvld U0, C0, 0x00 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvldrepl.d U0, C0, 0x00 + xvfmadd.d D4, D0, VALPHA, U0 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvldrepl.d U1, C1, 0x00 + xvfmadd.d D5, D0, VALPHA, U1 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvldrepl.d U2, C2, 0x00 + xvfmadd.d D6, D0, VALPHA, U2 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvldrepl.d U3, C3, 0x00 + xvfmadd.d D7, D0, VALPHA, U3 - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D4, C0, 0x00, 0x00 + xvstelm.d D5, C1, 0x00, 0x01 + xvstelm.d D6, C2, 0x00, 0x02 + xvstelm.d D7, C3, 0x00, 0x03 +#endif // #if defined(TRMMKERNEL) /* Add stride for C */ addi.d C0, C0, 0x08 @@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ///////////////////////////////////////////////// /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + xvldrepl.d VALPHA, $sp, 112 + .L_N3: andi J, N, 2 beq ZERO, J, .L_N1 @@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
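/* The N & 2 block below reuses the pipelined 16-row scheme with only two
 * B values per k-step, so B0 advances by 0x10 per step instead of 0x20.
 * In rough C (a sketch, not build code):
 *
 *     for (k = 0; k < K; k++) {
 *         double b0 = B0[2*k], b1 = B0[2*k + 1];
 *         for (i = 0; i < 16; i++) {
 *             D0_3[i] += A0[16*k + i] * b0;   // column written to C0
 *             D4_7[i] += A0[16*k + i] * b1;   // column written to C1
 *         }
 *     }
 */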
addi.d L, OFF, 2 #endif #else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_L7 */ - beq ZERO,TL, .L_N3_L7 - -.L_N3_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ - /* Load 16 
* 64 from A0 */ + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 xvld U2, A0, 0x40 xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x16x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x16x2_END .L_N3_L7: /* if (!(L & 7)) goto L_N3_L0 */ @@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 /* Add stride for A0, B0 */ addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 @@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D0, U0, U4 xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x40 @@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
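/* As in the other ..._TL1 loops, U8/U9 and U12/U13 are preloaded with the
 * A and B values of the next k-step before the loop is entered, so each
 * KERNEL2xMxN step can overlap the loads for step k+1 with the FMAs of
 * step k. The KERNELnxMxN_END variants close the chain by consuming the
 * preloaded registers without issuing loads beyond the packed buffers.
 * Schematically:
 *
 *     load(0); for (k = 1; k < K; k++) { fma(k-1); load(k); } fma(K-1);
 */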
/* if (TL < 1) goto L_N3_M8_L7 */ beq ZERO,TL, .L_N3_M8_L7 -.L_N3_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + beq ZERO, TL, .L_N3_M8_TL1_END - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 +.L_N3_M8_TL1: /* TL-- */ + KERNEL8x8x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M8_TL1 +.L_N3_M8_TL1_END: + KERNEL8x8x2_END .L_N3_M8_L7: /* if (!(L & 7)) goto L_N3_M8_L0 */ @@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 /* Add stride for A0, B0 */ addi.d A0, A0, 0x40 @@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
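/* In the TRMM fixups above and below, the slli.d shift amount encodes
 * log2(bytes consumed per k-step): log2(MR * 8) for A and log2(NR * 8)
 * for B, with 8-byte doubles. The M & 4 / N & 2 tile therefore uses
 * slli 0x05 for A (4 * 8 = 32 bytes) and slli 0x04 for B (2 * 8 = 16
 * bytes); a mismatched shift corrupts the OFF-based start addresses
 * rather than just costing performance.
 */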
add.d B0, B0, T0 #endif -#ifdef LEFT - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) - -/********LOOP (if(N & 2) && (M & 8) ) End************/ - -.L_N3_M4: - andi I, M, 4 - beq ZERO,I, .L_N3_M2 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - slli.d T0, OFF, 0x04 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 4 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M4_L7 */ - beq ZERO,TL, .L_N3_M4_L7 - -.L_N3_M4_TL1: /* TL-- */ - /***8-1***/ - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) - /***8-5***/ - xvld U0, A0, 0x00 +/********LOOP (if(N & 2) && (M & 8) ) End************/ - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-6***/ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - 
xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x20 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END .L_N3_M4_L7: /* if (!(L & 7)) goto L_N3_M4_L0 */ @@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x20 @@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N3_M2_L7 */ beq ZERO,TL, .L_N3_M2_L7 -.L_N3_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 + xvld U8, A0, 0x00 - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x10 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END .L_N3_M2_L7: /* if (!(L & 7)) goto L_N3_M2_L0 */ @@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
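/* For tiles narrower than one 256-bit register, fewer than four result
 * doubles belong to any one C pointer, so writeback switches from full
 * xvst stores to per-lane xvstelm.d stores, as in the N = 4 / M & 2 path:
 *
 *     xvstelm.d D0, C0, 0x00, 0x00    // C0[0] = lane 0 of D0
 *     xvstelm.d D0, C1, 0x00, 0x01    // C1[0] = lane 1 of D0
 *
 * which is also why the narrow paths keep results packed across columns
 * in D0/D1 instead of using one register per column.
 */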
xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x10 @@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else /* number of values in B */ addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M1_L7 */ - beq ZERO,TL, .L_N3_M1_L7 - -.L_N3_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 1 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 - /***8-8***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x08 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END .L_N3_M1_L7: /* if (!(L & 7)) goto L_N3_M1_L0 */ @@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, 
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_L7 */ beq ZERO,TL, .L_N1_L7 -.L_N1_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-7***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x80 addi.d B0, B0, 0x08 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x16x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x16x1_END .L_N1_L7: /* if (!(L & 7)) goto L_N1_L0 */ @@ -3494,161 +3006,87 
@@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slli.d T0, L, 0x07 add.d A0, A0, T0 - slli.d T0, L, 0x03 - add.d B0, B0, T0 -#endif - -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) - - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_N1_I1 - -.L_N1_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_N1_M0 - - andi I, M, 8 - beq ZERO,I, .L_N1_M4 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x03 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 1 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N1_M8_L7 */ - beq ZERO,TL, .L_N1_M8_L7 - -.L_N1_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + andi I, M, 8 + beq ZERO,I, .L_N1_M4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + 
xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x40 addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_M8_TL1_END +.L_N1_M8_TL1: /* TL-- */ + KERNEL8x8x1 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_TL1_END: + KERNEL8x8x1_END + .L_N1_M8_L7: /* if (!(L & 7)) goto L_N1_M8_L0 */ andi TL, L, 7 @@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M4_L7 */ beq ZERO,TL, .L_N1_M4_L7 -.L_N1_M4_TL1: /* TL-- */ - /***8-1***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END .L_N1_M4_L7: /* if (!(L & 7)) goto L_N1_M4_L0 */ @@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
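/* N & 1 tails: a single B value per k-step, so each iteration is one
 * broadcast plus one FMA per 4-double lane group, and B0 advances by
 * 0x08 per step. In rough C (a sketch, not build code):
 *
 *     for (k = 0; k < K; k++)
 *         for (i = 0; i < MR; i++)
 *             D[i] += A0[MR*k + i] * B0[k];
 */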
/* if (TL < 1) goto L_N1_M2_L7 */ beq ZERO,TL, .L_N1_M2_L7 -.L_N1_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x10 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END .L_N1_M2_L7: /* if (!(L & 7)) goto L_N1_M2_L0 */ @@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M1_L7 */ beq ZERO,TL, .L_N1_M1_L7 -.L_N1_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x08 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M1_TL1_END - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END .L_N1_M1_L7: /* if (!(L & 7)) goto L_N1_M1_L0 */ @@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
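/* The epilogue below mirrors the enlarged 120-byte frame established in
 * PROLOGUE:
 *
 *     sp+0  .. sp+32  : $r23-$r27 (SDARG / LDARG)
 *     sp+40 .. sp+104 : $f23-$f31 (ST / LD)
 *     sp+112          : ALPHA, reloaded into VALPHA with xvldrepl.d at
 *                       each .L_*_L0 writeback because the vector
 *                       registers holding it may have been clobbered
 *
 * Restoring only the old 56-byte frame here would return with $f24-$f31
 * trashed for the caller.
 */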
LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 LD $f23, $sp, 40 - addi.d $sp, $sp, 56 + LD $f24, $sp, 48 + LD $f25, $sp, 56 + LD $f26, $sp, 64 + LD $f27, $sp, 72 + LD $f28, $sp, 80 + LD $f29, $sp, 88 + LD $f30, $sp, 96 + LD $f31, $sp, 104 + addi.d $sp, $sp, 120 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S new file mode 100644 index 000000000..c6523f9ab --- /dev/null +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -0,0 +1,554 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/14 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 + +.macro DLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GMUL xvf, d, X0, X0, VALPHA +.endm + +.macro DLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DLOAD_Y_4 + GLD xv, , Y0, Y, 0 +.endm + +.macro DLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro DSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DSTORE_Y_4 + GST xv, , Y0, Y, 0 +.endm + +.macro DSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +// Unable to use vector load/store ins +.macro DLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro DLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 +.endm + +.macro DSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro DSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + 
xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 +.endm + +.macro DLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ + Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ + Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ + Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ + Y0, A14, X7, Y0, Y1, A15, X7, Y1 +.endm + +.macro DGEMV_N_4x8 + GLD_INC xv, , 0x20, A0, PA0, 0, \ + A2, PA1, 0, \ + A4, PA2, 0, \ + A6, PA3, 0, \ + A8, PA4, 0, \ + A10, PA5, 0, \ + A12, PA6, 0, \ + A14, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, \ + Y0, A6, X3, Y0, \ + Y0, A8, X4, Y0, \ + Y0, A10, X5, Y0, \ + Y0, A12, X6, Y0, \ + Y0, A14, X7, Y0 +.endm + +.macro DGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, \ + $f10, $f18, $f5, $f10, \ + $f10, $f20, $f6, $f10, \ + $f10, $f22, $f7, $f10, \ + $f10, $f24, $f8, $f10, \ + $f10, $f26, $f9, $f10, +.endm + +.macro DGEMV_N_8x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1 +.endm + +.macro DGEMV_N_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, Y0, A6, X3, Y0 +.endm + +.macro DGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 +.endm + +.macro DGEMV_N_8x2 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1 +.endm + +.macro DGEMV_N_4x2 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0 +.endm + +.macro DGEMV_N_1x2 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 + GMADD 
f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10 +.endm + +.macro DGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + fmadd.d $f10, $f12, $f2, $f10 +.endm + +.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + DLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x8 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\Y_4 + DGEMV_N_4x8 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x8 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + DLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x4 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\Y_4 + DGEMV_N_4x4 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_4_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x4 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + DLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x2 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\Y_4 + DGEMV_N_4x2 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_2_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x2 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD PA1, PA1, K_LDA + PTR_ALSL X, INC_X, X, 1 +.L_\XW\()_N_1: + andi J, N, 1 
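+ // Tail: the last odd column (N & 1). Each pass of the loop below computes,
+ // in C terms (editor's sketch), y[i * inc_y] += (alpha * x[col]) * a[i]:
+ // DLOAD_\X_1 below folds alpha into X0 up front, so DGEMV_N_1x1 is a single
+ // fld.d/fmadd.d pair per row of the remaining column.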
+ beqz J, .L_END
+ DLOAD_\X_1
+ xor K, K, K
+ move Y, Y_ORG
+ move I, M
+ beqz I, .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+ DLOAD_\Y_1
+ DGEMV_N_1x1
+ DSTORE_\Y_1
+ PTR_ADDI I, I, -1
+ PTR_ADD Y, Y, INC_Y
+ PTR_ADDI K, K, 1
+ bnez I, .L_\XW\()_N_1_M_L1
+ b .L_END
+.endm
+
+ PROLOGUE
+ PTR_LD INC_Y, $sp, 0
+ push_if_used 17 + 7, 24 + 4
+ PTR_ADDI K, $r0, 0x01
+ PTR_SUB I, INC_X, K
+ PTR_SUB J, INC_Y, K
+ maskeqz I, K, I /* if (inc_x == 1) I = 0; else I = 1; */
+ maskeqz J, K, J /* if (inc_y == 1) J = 0; else J = 1; */
+ PTR_ALSL I, I, J, 1
+ GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+ xvreplve0.d VALPHA, $xr0
+ move Y_ORG, Y
+ move PA0, A
+#if __loongarch_grlen == 64
+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+ la.local T0, .L_GAP_TABLE
+ PTR_ALSL I, I, T0, 1
+ ld.h K, I, 0
+ PTR_ADD T0, T0, K
+ jirl $r0, T0, 0
+.L_GAP_TABLE:
+ .hword .L_GAP_0_0 - .L_GAP_TABLE
+ .hword .L_GAP_0_1 - .L_GAP_TABLE
+ .hword .L_GAP_1_0 - .L_GAP_TABLE
+ .hword .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* inc_x == 1 && inc_y == 1 */
+ DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+.L_GAP_0_1: /* inc_x == 1 && inc_y != 1 */
+ DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_GAP_1_0: /* inc_x != 1 && inc_y == 1 */
+ DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+.L_GAP_1_1: /* inc_x != 1 && inc_y != 1 */
+ DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_END:
+ pop_if_used 17 + 7, 24 + 4
+ jirl $r0, $r1, 0x0
+ EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S
new file mode 100644
index 000000000..7f57c1d88
--- /dev/null
+++ b/kernel/loongarch64/dgemv_t_8_lasx.S
@@ -0,0 +1,481 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/17 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y2 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro DLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro DLOAD_X4 + GLD xv, , X0, X, 0x00 +.endm + +.macro DLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro DLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 +.endm + +.macro DGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, 
TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ + TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ + TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ + TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ + TP7, A14, X0, TP7, TP7, A15, X1, TP7 +.endm + +.macro DGEMV_T_8x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ + A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ + TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ + TP6, A12, X0, TP6, TP7, A14, X0, TP7, +.endm + +.macro DGEMV_T_4x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3 +.endm + +.macro DGEMV_T_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3 +.endm + +.macro DGEMV_T_2x8 + GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1 +.endm + +.macro DGEMV_T_2x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 +.endm + +.macro DGEMV_T XW:req X8:req, X4:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\X8 + DGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\X4 + DGEMV_T_8x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ + $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ + $f15, ALPHA, $f7, $f15, $f16, 
ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\X8 + DGEMV_T_4x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\X4 + DGEMV_T_4x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_4_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 + + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\X8 + DGEMV_T_2x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\X4 + DGEMV_T_2x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_2_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1 + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 + + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + fst.d 
$f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + fmadd.d $f19, $f3, $f1, $f19 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 + fld.d $f3, Y, 0x00 + fmadd.d $f3, ALPHA, $f19, $f3 + fst.d $f3, Y, 0x00 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 24 + 3 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + DGEMV_T GAP_0, X8, X4 +.L_GAP_1: /* if (incx != 1) */ + DGEMV_T GAP_1, X8_GAP, X4_GAP +.L_END: + pop_if_used 17 + 8, 24 + 3 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S index ff937ae53..2160b93a6 100644 --- a/kernel/loongarch64/dnrm2.S +++ b/kernel/loongarch64/dnrm2.S @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 move XX, X NOP LD a1, X, 0 * SIZE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S new file mode 100644 index 000000000..694dcdaa9 --- /dev/null +++ b/kernel/loongarch64/loongarch64_asm.S @@ -0,0 +1,430 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#if __loongarch_grlen == 64 +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d +#define PTR_ALSL alsl.d +#elif __loongarch_grlen == 32 +#define LA_REG int32_t +#define REG_SIZE 4 +#define REG_LOG 2 +#define PTR_ADDI addi.w +#define PTR_ADD add.w +#define PTR_SUB sub.w +#define PTR_LD ld.w +#define PTR_ST st.w +#define PTR_SLLI slli.w +#define PTR_SRLI srli.w +#define PTR_SRAI srai.w +#define PTR_MUL mul.w +#define PTR_ALSL alsl.w +#else +// If neither of the above two conditions is supported, it means this is an early +// internal toolchain. To ensure maximum compatibility, the following approach is taken: +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d +#define PTR_ALSL alsl.d +#endif + +#if __loongarch_frlen == 64 +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d +#elif __loongarch_frlen == 32 +#define FREG_SIZE 4 +#define FREG_LOG 2 +#define PTR_FLD fld.s +#define PTR_FST fst.s +#else +// If neither of the above two conditions is supported, it means this is an early +// internal toolchain. To ensure maximum compatibility, the following approach is taken: +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d +#endif + +// The max registers available to the user which +// do not need to be preserved across calls. 
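+// In this ABI those are $r4-$r20 (a0-a7 plus t0-t8, 17 GPRs) and $f0-$f23
+// (fa0-fa7 plus ft0-ft15, 24 FPRs); anything beyond those counts is a static
+// register that must be spilled first. For example (editor's note), the dgemv_n
+// kernel above calls `push_if_used 17 + 7, 24 + 4`, which reserves 56 + 32
+// bytes and saves $s0-$s6 and $fs0-$fs3 before they are reused as PA1-PA7 and
+// A12-A15.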
+// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html +#define MAX_INT_CALLER_SAVED 17 +#define MAX_FP_CALLER_SAVED 24 + +.altmacro // Enable alternate macro mode + +.macro push_if_used regs, fregs +.if \regs > MAX_INT_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) + push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 +.endif +.if \fregs > MAX_FP_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) + push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 +.endif +.endm // End push_if_used +.macro pop_if_used regs, fregs +.if \fregs > MAX_FP_CALLER_SAVED + pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG +.endif +.if \regs > MAX_INT_CALLER_SAVED + pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG +.endif +.endm // End pop_if_used +.macro push_regs from, to + PTR_ST $s\()\from, $sp, \from << REG_LOG +.if \to - \from + push_regs %from + 1, \to +.endif +.endm // End push_regs +.macro pop_regs from, to + PTR_LD $s\()\from, $sp, \from << REG_LOG +.if \to - \from + pop_regs %from + 1, \to +.endif +.endm // End pop_regs +.macro push_fregs from, to + PTR_FST $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + push_fregs %from + 1, \to +.endif +.endm // End push_fregs +.macro pop_fregs from, to + PTR_FLD $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + pop_fregs %from + 1, \to +.endif +.endm // End pop_fregs + +// +// Instruction Related Macros +// +// GLD +// +.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLD \pre_op, \suf_op, \more +.endif +.endm + +// +// GLD_INC +// +.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif + PTR_ADDI \src, \src, \inc +.ifnb \more + GLD_INC \pre_op, \suf_op, \inc, \more +.endif +.endm +// +// GLDX is same as GLD except the stride is a register +// +.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ldx \out, \src, \offset +.else + \pre_op\()ldx.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLDX \pre_op, \suf_op, \more +.endif +.endm +// +// GLDREPL +// +.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg + \pre_op\()ldrepl.\suf_op \out, \src, \offset +.ifnb \more + GLDREPL \pre_op, \suf_op, \more +.endif +.endm +// +// GST +// +.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()st \src, \dst, \offset +.else + \pre_op\()st.\suf_op \src, \dst, \offset +.endif +.ifnb \more + GST \pre_op, \suf_op, \more +.endif +.endm +// +// GMUL +// +.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()mul.\suf_op \out, \in0, \in1 +.ifnb \more + GMUL \pre_op, \suf_op, \more +.endif +.endm +// +// GMADD +// +.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GMADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADD +// +.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()add.\suf_op \out, \in0, \in1 +.ifnb \more + GADD \pre_op, 
\suf_op, \more +.endif +.endm +// +// GADDI +// +.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()addi.\suf_op \out, \in0, \in1 +.ifnb \more + GADDI \pre_op, \suf_op, \more +.endif +.endm +// +// GSUB +// +.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()sub.\suf_op \out, \in0, \in1 +.ifnb \more + GSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GSLLI +// +.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()slli.\suf_op \out, \in0, \in1 +.ifnb \more + GSLLI \pre_op, \suf_op, \more +.endif +.endm +// +// GINSVE0 +// +.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()insve0.\suf_op \out, \in0, \in1 +.ifnb \more + GINSVE0 \pre_op, \suf_op, \more +.endif +.endm +// +// GXOR +// +.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()xor.\suf_op \out, \in0, \in1 +.ifnb \more + GXOR \pre_op, \suf_op, \more +.endif +.endm +// +// GPERMI +// +.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()permi.\suf_op \out, \in0, \in1 +.ifnb \more + GPERMI \pre_op, \suf_op, \more +.endif +.endm +// +// GNMSUB +// +.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GNMSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GPRELD +// +.macro GPRELD in0:req, in1:req, in2:req, more:vararg + preld \in0, \in1, \in2 +.ifnb \more + GPRELD \more +.endif +.endm + +// +// Compound instructions +// +// GACC: Accumulate the values of vector registers +// +.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "xv" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + xvpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + xvpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifeqs "\pre_op", "v" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + vpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + vpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifnb \more + GACC \pre_op, \suf_op, \more +.endif +.endm +// +// GMOV +// +.macro GMOV pre_op:req, out:req, in:req, more:vararg + \pre_op\()or.v \out, \in, \in +.ifnb \more + GMOV \pre_op, \more +.endif +.endm + +// +// Media Related Macros +// +.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()ilvl.\suf_op \out0, \in0, \in1 + \pre_op\()ilvh.\suf_op \out1, \in0, \in1 +.endm +.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()pickev.\suf_op \out0, \in0, \in1 + 
\pre_op\()pickod.\suf_op \out1, \in0, \in1 +.endm + +// +// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, +// has no pre_op param. 128-bit vector instructions are not supported. +// +.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + vt0, vt1 + GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 + GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 + GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 + GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 +.endm + +.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ + in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0, tmp1, tmp2, tmp3 + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 + GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 + + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 + GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 + + GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 + + GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ + \out2, \out6, 0x02, \out3, \out7, 0x02, \ + \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ + \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 +.endm diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S new file mode 100644 index 000000000..bd609394e --- /dev/null +++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S @@ -0,0 +1,2348 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+* UTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+*
+* 2023/08/23 guxiwei
+* Parameter:
+* SGEMM_DEFAULT_UNROLL_N 8
+* SGEMM_DEFAULT_UNROLL_M 16
+* SGEMM_DEFAULT_P 256
+* SGEMM_DEFAULT_Q 256
+* SGEMM_DEFAULT_R 1024
+* A_PRE 1024
+* B_PRE 256 // Enabling prefetching for B causes a performance decrease, so it is temporarily disabled.
+*
+*
+* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000:
+* 1 thread: 71.7 GFLOPS
+* 2 threads: 142.6 GFLOPS
+* 3 threads: 211.5 GFLOPS
+* 4 threads: 265.0 GFLOPS
+*********************************************************************/
+
+/* Function parameters */
+#define M $r4 // param 1: bm
+#define N $r5 // param 2: bn
+#define K $r6 // param 3: bk
+#define ALPHA $f0 // param 4: alpha
+#define A $r7 // param 5: ba
+#define B $r8 // param 6: bb
+#define C $r9 // param 7: bc
+#define LDC $r10 // param 8: ldc
+
+#ifdef TRMMKERNEL
+#define OFFSET $r11 // param 9: offset
+#endif
+#define OFF $r12
+
+/* Loop control parameters */
+#define I $r13
+#define J $r14
+#define L $r15
+#define TL $r16
+/* Matrix addresses */
+#define A0 $r17
+#define B0 $r18
+#define C0 $r19
+#define C1 $r20
+#define C2 $r23
+#define C3 $r24
+#define C4 $r25
+#define C5 $r26
+#define C6 $r27
+#define C7 $r28
+#define T0 $r29
+#define T1 $r30
+#undef ZERO
+#define ZERO $r0
+
+/* LASX Vectors
+ * Store 16 sets of 32-bit data from A using U0 and U1, with each register holding 8 elements.
+ * Use X0 through X7 to store 8 sets of 32-bit data from B, with each register holding the broadcast of a single element.
+ * Use D0 to D15 to store intermediate values of the computation.
+ * Use VALPHA to store the broadcast value of alpha.
+ */
+#define U0 $xr0
+#define U1 $xr1
+#define X0 $xr2
+#define X1 $xr3
+#define X2 $xr4
+#define X3 $xr5
+#define X4 $xr6
+#define X5 $xr7
+#define X6 $xr8
+#define X7 $xr9
+#define D0 $xr10
+#define D1 $xr11
+#define D2 $xr12
+#define D3 $xr13
+#define D4 $xr14
+#define D5 $xr15
+#define D6 $xr16
+#define D7 $xr17
+#define D8 $xr18
+#define D9 $xr19
+#define D10 $xr20
+#define D11 $xr21
+#define D12 $xr22
+#define D13 $xr23
+#define D14 $xr24
+#define D15 $xr25
+#define VALPHA $xr26
+
+/* Prefetch interval */
+#define A_PRE 0x400
+#define B_PRE 0x100
+
+// Loops outline:
+// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */
+// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; otherwise, enter .L_M16. */
+// | | .L_M16_TL1 | |
+// | | .L_M16_L7 | The entire core loop of the function, KERNEL16x8 |
+// | | .L_M16_L71 | |
+// | | .L_M16_L0 ---------------- |
+// | .L_M8 |
+// | | .L_M8_TL1 | |
+// | | .L_M8_L7 | KERNEL8x8 | |
+// | | .L_M8_L71 | |
+// | | .L_M8_L0 | |
+// | .L_M4 |
+// | | .L_M4_TL1 | |
+// | | .L_M4_L7 | KERNEL4x8 | |
+// | | .L_M4_L71 | |
+// | | .L_M4_L0 | |
+// | .L_M2 |
+// | | .L_M2_TL1 | |
+// | | .L_M2_L7 | KERNEL2x8 | |
+// | | .L_M2_L71 | |
+// | | .L_M2_L0 | |
+// | .L_M1 |
+// | | .L_M1_TL1 | |
+// | | .L_M1_L7 | KERNEL1x8 | |
+// | | .L_M1_L71 | |
+// | | .L_M1_L0 | |
+// | .L_M0------------------------------------------------------------------------------------------
+// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */
+// .L_N4
+// | .L_N4_M16 <---------------------
+// | | .L_N4_M16_TL1 |
+// | | .L_N4_M16_L7 | KERNEL16x4
+// | | .L_N4_M16_L71 |
+// | | .L_N4_M16_L0 ----------------
+// | .L_N4_M8
+// | | .L_N4_M8_TL1 |
+// | | .L_N4_M8_L7 | KERNEL8x4
+// | | .L_N4_M8_L71 |
+// | | .L_N4_M8_L0 |
+// | .L_N4_M4
+// | | .L_N4_M4_TL1 |
+// | | .L_N4_M4_L7 | KERNEL4x4
+// | | .L_N4_M4_L71 |
+// | | .L_N4_M4_L0 |
+// | .L_N4_M2
+// | | .L_N4_M2_TL1 |
+// | | .L_N4_M2_L7 | KERNEL2x4
+// | | .L_N4_M2_L71 |
+// | | .L_N4_M2_L0 |
+// | .L_N4_M1
+// | | .L_N4_M1_TL1 |
+// | | .L_N4_M1_L7 | KERNEL1x4
+// | | .L_N4_M1_L71 |
+// | | .L_N4_M1_L0 |
+// | .L_N4_M0
+// .L_N3 /* if N & 2 == 0, goto .L_N1; else, enter .L_N2 */
+// .L_N2
+// | .L_N2_M16 <---------------------
+// | | .L_N2_M16_TL1 |
+// | | .L_N2_M16_L7 | KERNEL16x2
+// | | .L_N2_M16_L71 |
+// | | .L_N2_M16_L0 ----------------
+// | .L_N2_M8
+// | | .L_N2_M8_TL1 |
+// | | .L_N2_M8_L7 | KERNEL8x2
+// | | .L_N2_M8_L71 |
+// | | .L_N2_M8_L0 |
+// | .L_N2_M4
+// | | .L_N2_M4_TL1 |
+// | | .L_N2_M4_L7 | KERNEL4x2
+// | | .L_N2_M4_L71 |
+// | | .L_N2_M4_L0 |
+// | .L_N2_M2
+// | | .L_N2_M2_TL1 |
+// | | .L_N2_M2_L7 | KERNEL2x2
+// | | .L_N2_M2_L71 |
+// | | .L_N2_M2_L0 |
+// | .L_N2_M1
+// | | .L_N2_M1_TL1 |
+// | | .L_N2_M1_L7 | KERNEL1x2
+// | | .L_N2_M1_L71 |
+// | | .L_N2_M1_L0 |
+// | .L_N2_M0
+// .L_N1
+// | .L_N1_M16 <---------------------
+// | | .L_N1_M16_TL1 |
+// | | .L_N1_M16_L7 | KERNEL16x1
+// | | .L_N1_M16_L71 |
+// | | .L_N1_M16_L0 ----------------
+// | .L_N1_M8
+// | | .L_N1_M8_TL1 |
+// | | .L_N1_M8_L7 | KERNEL8x1
+// | | .L_N1_M8_L71 |
+// | | .L_N1_M8_L0 |
+// | .L_N1_M4
+// | | .L_N1_M4_TL1 |
+// | | .L_N1_M4_L7 | KERNEL4x1
+// | | .L_N1_M4_L71 |
+// | | .L_N1_M4_L0 |
+// | .L_N1_M2
+// | | .L_N1_M2_TL1 |
+// | | .L_N1_M2_L7 | KERNEL2x1
+// | | .L_N1_M2_L71 |
+// | | .L_N1_M2_L0 |
+// | .L_N1_M1
+// | | .L_N1_M1_TL1 |
+// | | .L_N1_M1_L7 | KERNEL1x1
+// | | .L_N1_M1_L71 |
+// | | .L_N1_M1_L0 |
+// | .L_N1_M0
+// .L_N0
+
+/*************** sgemm_kernel_macros ***************/
+.macro KERNEL1x16x8_START
+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+ GMUL xvf, s, D0, U0, X0, D1, U1, X0
+ preld 0, C0, 0x00
+ GMUL xvf, s, D2, U0, X1, D3, U1, X1
+ preld 0, C1, 0x00
+ GMUL xvf, s, D4, U0, X2, D5, U1, X2
+ preld 0, C2, 0x00
+ GMUL xvf, s, D6, U0, X3, D7, U1, X3
+ preld 0, C3, 0x00
+ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+ GMUL xvf, s, D8, U0, X4, D9, U1, X4
+ preld 0, C4, 0x00
+ GMUL xvf, s, D10, U0, X5, D11, U1, X5
+ preld 0, C5, 0x00
+ GMUL xvf, s, D12, U0, X6, D13, U1, X6
+ preld 0, C6, 0x00
+ GMUL xvf, s, D14, U0, X7, D15, U1, X7
+ preld 0, C7, 0x00
+ PTR_ADDI A0, A0, 0x40
+ PTR_ADDI B0, B0, 0x20
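+ // One k-step of the 16x8 tile. In C terms (editor's sketch):
+ //   for (j = 0; j < 8; j++)
+ //     for (i = 0; i < 16; i++)
+ //       acc[j][i] = a[i] * b[j];  /* GMUL overwrites; KERNEL1x16x8 accumulates via GMADD */
+ // The preld hints touch each of the eight C rows once, so the tile is already
+ // cached when SAVE16x8 reads and rewrites it.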
+.endm + +.macro KERNEL1x16x8 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + preld 0, A0, A_PRE + GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ + D10, U0, X5, D10, D11, U1, X5, D11 + //preld 0, B0, B_PRE + GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ + D14, U0, X7, D14, D15, U1, X7 D15 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8x16x8 +.rept 8 + KERNEL1x16x8 +.endr +.endm + +.macro SAVE16x8 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 + /* Load C4 */ + GLD xv, , X0, C4, 0x00, X1, C4, 0x20 + GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 + /* Load C5 */ + GLD xv, , X2, C5, 0x00, X3, C5, 0x20 + GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 + /* Load C6 */ + GLD xv, , X4, C6, 0x00, X5, C6, 0x20 + GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 + /* Load C7 */ + GLD xv, , X6, C7, 0x00, X7, C7, 0x20 + GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20, \ + D8, C4, 0x00, D9, C4, 0x20, \ + D10, C5, 0x00, D11, C5, 0x20, \ + D12, C6, 0x00, D13, C6, 0x20, \ + D14, C7, 0x00, D15, C7, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx8_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ + D12, U0, X6, D14, U0, X7 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL1xMx8 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + + GLDREPL xv, w, X0, 
B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ + D12, U0, X6, D12, D14, U0, X7, D14 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8xMx8 m, stride +.rept 8 + KERNEL1xMx8 \m, \stride +.endr +.endm + +.macro SAVEMx8 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA, \ + D8, D8, VALPHA, D10, D10, VALPHA, \ + D12, D12, VALPHA, D14, D14, VALPHA +#else + /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +.if \m == 8 + GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 +.elseif \m == 4 + GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.endif + GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ + D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00, \ + D8, C4, 0x00, D10, C5, 0x00, \ + D12, C6, 0x00, D14, C7, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00, \ + $vr18, C4, 0x00, $vr20, C5, 0x00, \ + $vr22, C6, 0x00, $vr24, C7, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#endif +.endm + +.macro KERNEL1x16x4_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1, \ + D4, U0, X2, D5, U1, X2, \ + D6, U0, X3, D7, U1, X3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1x16x4 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3, \ + D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8x16x4 +.rept 8 
+ KERNEL1x16x4 +.endr +.endm + +.macro SAVE16x4 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx4_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1xMx4 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8xMx4 m, stride +.rept 8 + KERNEL1xMx4 \m, \stride +.endr +.endm + +.macro SAVEMx4 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA +#else + /* Load C0, C1, C2, C3 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#endif +.endm + +.macro KERNEL1x16x2_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, 
B0, 0x04 + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1x16x2 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8x16x2 +.rept 8 + KERNEL1x16x2 +.endr +.endm + +.macro SAVE16x2 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40, C1, C1, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx2_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D2, U0, X1 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1xMx2 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8xMx2 m, stride +.rept 8 + KERNEL1xMx2 \m, \stride +.endr +.endm + +.macro SAVEMx2 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride, C1, C1, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride +#endif +.endm + +.macro KERNEL1x16x1_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1x16x1 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8x16x1 +.rept 8 + KERNEL1x16x1 +.endr +.endm + +.macro SAVE16x1 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, 
C0, 0x00, D1, C0, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40 +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, 0x40 +#else + GADDI , d, C0, C0, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx1_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1xMx1 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8xMx1 m, stride +.rept 8 + KERNEL1xMx1 \m, \stride +.endr +.endm + +.macro SAVEMx1 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride +#elif __loongarch_grlen == 32 + GADDI , w, C0, C0, \stride +#else + GADDI , d, C0, C0, \stride +#endif +.endm + + PROLOGUE + push_if_used 26, 32 + xvreplve0.w VALPHA, $xr0 +#if defined (TRMMKERNEL) && !defined(LEFT) + PTR_SUB OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + /* if (!(N >> 3)) goto L_N7 */ + PTR_SRAI J, N, 3 /* J = bn >> 3 */ + andi N, N, 0x07 + beq ZERO, J, .L_N7 +.L_N8: /* J -- */ + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADDI J, J, -1 /* J-- */ +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#elif __loongarch_grlen == 32 + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#else + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 +.align 5 +.L_M16: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x8_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M16_L7 */ + beq ZERO,TL, .L_M16_L7 +.align 5 +.L_M16_TL1: + KERNEL8x16x8 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M16_TL1 +.L_M16_L7: + andi TL, L, 7 + beq TL, ZERO,.L_M16_L0 +.align 5 +.L_M16_L71: + KERNEL1x16x8 + PTR_ADDI 
TL, TL, -1 + blt ZERO,TL, .L_M16_L71 +.L_M16_L0: + SAVE16x8 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -16 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_M16 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + KERNEL1xMx8_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 +.align 5 +.L_M8_TL1: + KERNEL8xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 +.align 5 +.L_M8_L71: + KERNEL1xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M8_L71 +.L_M8_L0: + SAVEMx8 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -8 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 +.align 5 +.L_M4_TL1: + KERNEL8xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 +.L_M4_L71: + KERNEL1xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_L71 +.L_M4_L0: + SAVEMx8 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -4 +#else + /* number of 
values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 2, 0x08 + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 +.align 5 +.L_M2_TL1: + KERNEL8xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 +.align 5 +.L_M2_L71: + KERNEL1xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M2_L71 +.L_M2_L0: + SAVEMx8 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -2 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 +.align 5 +.L_M1_TL1: + KERNEL8xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 +.align 5 +.L_M1_L71: + KERNEL1xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M1_L71 +.L_M1_L0: + SAVEMx8 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -1 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 5 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x08 /* number of values 
in B */ +#endif + blt ZERO, J, .L_N8 + +.L_N7: + andi J, N, 4 + beq ZERO, J, .L_N3 +.L_N4: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#elif __loongarch_grlen == 32 + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#else + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N4_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N4_M8 +.align 5 +.L_N4_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x4_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_L7 */ + beq ZERO,TL, .L_N4_M16_L7 +.align 5 +.L_N4_M16_TL1: /* TL-- */ + KERNEL8x16x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M16_TL1 +.L_N4_M16_L7: + /* if (!(L & 7)) goto L_N4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M16_L0 +.align 5 +.L_N4_M16_L71: + KERNEL1x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M16_L71 +.L_N4_M16_L0: + SAVE16x4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N4_M16 +.L_N4_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N4_M0 + + andi I, M, 8 + beq ZERO,I, .L_N4_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M8_L7 */ + beq ZERO,TL, .L_N4_M8_L7 +.align 5 +.L_N4_M8_TL1: /* TL-- */ + KERNEL8xMx4 8, 0x20 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M8_TL1 +.L_N4_M8_L7: + /* if (!(L & 7)) goto L_N4_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M8_L0 +.align 5 +.L_N4_M8_L71: + KERNEL1xMx4 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M8_L71 +.L_N4_M8_L0: + SAVEMx4 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + 
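Reviewer note: the `#if defined(TRMMKERNEL)` ladder bracketing every KERNEL*_START/SAVE pair above and below repeats one piece of bookkeeping: select the slice of K that an mr-by-nr tile of a triangular operand actually touches, then re-advance A0/B0 past the part it skipped; right below this point, OFF is bumped by the tile height for the LEFT case. A minimal C model of the range selection, under the conventional BLAS meaning of LEFT/TRANSA; the names `off`, `mr`, `nr` and the struct are illustrative, a sketch of the logic rather than the kernel's literal code:

/* Hypothetical model of the TRMM range selection repeated around each tile.
 * off mirrors the running OFF register, k the full depth K. */
typedef struct { int a_skip, b_skip, l; } trmm_range;

static trmm_range trmm_tile(int k, int off, int mr, int nr,
                            int left, int transa)
{
    trmm_range r = {0, 0, k};
    if (!((left && transa) || (!left && !transa))) {
        r.a_skip = mr * off;           /* A0 += mr * OFF elements     */
        r.b_skip = nr * off;           /* B0  = B + nr * OFF elements */
    }                                  /* otherwise B0 = B, A0 as-is  */
    if ((left && !transa) || (!left && transa))
        r.l = k - off;                 /* remaining K values          */
    else
        r.l = off + (left ? mr : nr);  /* "number of values in A/B"   */
    return r;
}

After each SAVE the same four-way split re-advances A0 and B0 by the untouched K range (and, when LEFT is defined, advances OFF by the tile height); in the plain GEMM build the whole ladder collapses to the `#else` branch visible in the source, `move B0, B; move L, K`, and OFF is never consulted.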
+#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M4: + andi I, M, 4 + beq ZERO,I, .L_N4_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M4_L7 */ + beq ZERO,TL, .L_N4_M4_L7 +.align 5 +.L_N4_M4_TL1: /* TL-- */ + KERNEL8xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M4_TL1 +.L_N4_M4_L7: + /* if (!(L & 7)) goto L_N4_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M4_L0 +.align 5 +.L_N4_M4_L71: + KERNEL1xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M4_L71 +.L_N4_M4_L0: + SAVEMx4 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M2: + andi I, M, 2 + beq ZERO,I, .L_N4_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M2_L7 */ + beq ZERO,TL, .L_N4_M2_L7 +.align 5 +.L_N4_M2_TL1: /* TL-- */ + KERNEL8xMx4 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M2_TL1 +.L_N4_M2_L7: + /* if (!(L & 7)) goto L_N4_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M2_L0 +.align 5 +.L_N4_M2_L71: + KERNEL1xMx4 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M2_L71 +.L_N4_M2_L0: + SAVEMx4 2, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M1: + andi I, M, 1 + beq ZERO,I, .L_N4_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, 
OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M1_L7 */ + beq ZERO,TL, .L_N4_M1_L7 +.align 5 +.L_N4_M1_TL1: /* TL-- */ + KERNEL8xMx4 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M1_TL1 +.L_N4_M1_L7: + /* if (!(L & 7)) goto L_N4_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M1_L0 +.align 5 +.L_N4_M1_L71: + KERNEL1xMx4 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M1_L71 +.L_N4_M1_L0: + SAVEMx4 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M0: + /* Add stride for B and C + * B += 4 * K + * C += 4 * LDC + */ + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 4 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x04 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +.L_N2: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADD C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N2_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N2_M8 +.align 5 +.L_N2_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x2_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M16_L7 */ + beq ZERO,TL, .L_N2_M16_L7 +.align 5 +.L_N2_M16_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M16_TL1 +.L_N2_M16_L7: + /* if (!(L & 7)) goto L_N2_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M16_L0 +.align 5 +.L_N2_M16_L71: + KERNEL1x16x2 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M16_L71 +.L_N2_M16_L0: + SAVE16x2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N2_M16 +.L_N2_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N2_M0 + + andi I, M, 8 + beq ZERO,I, .L_N2_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif 
defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M8_L7 */ + beq ZERO,TL, .L_N2_M8_L7 +.align 5 +.L_N2_M8_TL1: /* TL-- */ + KERNEL8xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M8_TL1 +.L_N2_M8_L7: + /* if (!(L & 7)) goto L_N2_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M8_L0 +.align 5 +.L_N2_M8_L71: + KERNEL1xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M8_L71 +.L_N2_M8_L0: + SAVEMx2 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M4: + andi I, M, 4 + beq ZERO,I, .L_N2_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M4_L7 */ + beq ZERO,TL, .L_N2_M4_L7 +.align 5 +.L_N2_M4_TL1: /* TL-- */ + KERNEL8xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M4_TL1 +.L_N2_M4_L7: + /* if (!(L & 7)) goto L_N2_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M4_L0 +.align 5 +.L_N2_M4_L71: + KERNEL1xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M4_L71 +.L_N2_M4_L0: + SAVEMx2 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M2: + andi I, M, 2 + beq ZERO,I, .L_N2_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M2_L7 */ + beq ZERO,TL, .L_N2_M2_L7 +.align 5 +.L_N2_M2_TL1: /* TL-- */ + KERNEL8xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M2_TL1 +.L_N2_M2_L7: + /* if (!(L & 7)) goto L_N2_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M2_L0 +.align 5 
+.L_N2_M2_L71: + KERNEL1xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M2_L71 +.L_N2_M2_L0: + SAVEMx2 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M1: + andi I, M, 1 + beq ZERO,I, .L_N2_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M1_L7 */ + beq ZERO,TL, .L_N2_M1_L7 +.align 5 +.L_N2_M1_TL1: /* TL-- */ + KERNEL8xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M1_TL1 +.L_N2_M1_L7: + /* if (!(L & 7)) goto L_N2_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M1_L0 +.align 5 +.L_N2_M1_L71: + KERNEL1xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M1_L71 +.L_N2_M1_L0: + SAVEMx2 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M0: + /* Add stride for B and C + * B += 2 * K + * C += 2 * LDC + */ + PTR_SLLI T0, K, 3 + PTR_SLLI T1, LDC, 3 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x02 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_N1_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 +.L_N1_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x1_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M16_L7 */ + beq ZERO,TL, .L_N1_M16_L7 +.align 5 +.L_N1_M16_TL1: /* TL-- */ + KERNEL8x16x1 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M16_TL1 +.L_N1_M16_L7: + /* if (!(L & 7)) goto L_N1_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M16_L0 +.align 5 +.L_N1_M16_L71: + KERNEL1x16x1 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M16_L71 +.L_N1_M16_L0: + SAVE16x1 +#if 
defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_M16 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 +.align 5 +.L_N1_M8_TL1: /* TL-- */ + KERNEL8xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 +.align 5 +.L_N1_M8_L71: + KERNEL1xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 +.L_N1_M8_L0: + SAVEMx1 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 +.align 5 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8xMx1 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 +.align 5 +.L_N1_M4_L71: + KERNEL1xMx1 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 +.L_N1_M4_L0: + SAVEMx1 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 +#if defined(TRMMKERNEL) 
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 +.align 5 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8xMx1 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 +.align 5 +.L_N1_M2_L71: + KERNEL1xMx1 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 +.L_N1_M2_L0: + SAVEMx1 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 +.align 5 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8xMx1 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 +.align 5 +.L_N1_M1_L71: + KERNEL1xMx1 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + SAVEMx1 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S new file mode 100644 index 000000000..266c07c5c --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +// Loops outline +//.L_N16 <------------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0: --------------- +//.L_N15: +//.L_N8: +//| .L_N8_M8: +//| .L_N8_M7: +//| .L_N8_M1: +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 26, 32 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x04 + beq J, ZERO, .L_N15 +.align 5 +.L_N16: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI 
I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S3, TL + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S9, S7, T0 + PTR_ADD S10, S8, T0 + PTR_ADD S11, S9, T0 + PTR_ADD S12, S10, T0 + PTR_ADD S13, S11, T0 + PTR_ADD S14, S12, T0 + PTR_ADD S15, S13, T0 + PTR_ADD S16, S14, T0 + PTR_ADD TS, S15, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ + U8, U9, U10, U11, U12, U13, U14, U15, \ + U0, U1, U2, U3 // As tmp + GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ + D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ + D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI S9, S9, 0x20 + PTR_ADDI S10, S10, 0x20 + PTR_ADDI S11, S11, 0x20 + PTR_ADDI S12, S12, 0x20 + PTR_ADDI S13, S13, 0x20 + PTR_ADDI S14, S14, 0x20 + PTR_ADDI S15, S15, 0x20 + PTR_ADDI S16, S16, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI TD, TD, 0x20 + + fld.s F0, S9, 0x00 + fld.s F1, S10, 0x00 + fld.s F2, S11, 0x00 + fld.s F3, S12, 0x00 + fld.s F4, S13, 0x00 + fld.s F5, S14, 0x00 + fld.s F6, S15, 0x00 + fld.s F7, S16, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S9, S9, 0x04 + PTR_ADDI S10, S10, 0x04 + PTR_ADDI S11, S11, 0x04 + PTR_ADDI S12, S12, 0x04 + PTR_ADDI S13, S13, 0x04 + PTR_ADDI S14, S14, 0x04 + PTR_ADDI S15, S15, 0x04 + PTR_ADDI S16, S16, 0x04 + PTR_ADDI TD, TD, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N16 +.L_N15: + andi J, N, 0x0f + beq ZERO, J, .L_N0 + + andi J, N, 0x08 + beq ZERO, J, .L_N7 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_N8_M7 +.align 5 +.L_N8_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, 
D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M8 +.L_N8_M7: + andi I, M, 0x07 + beq I, ZERO, .L_N7 +.align 5 +.L_N8_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M1 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..5c173568b --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S @@ -0,0 +1,298 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. 
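Reviewer note on the ncopy packing (shared by sgemm_ncopy_16_lasx.S above and this sgemm_ncopy_8_lasx.S): the 4-wide tail path `.L_N4_M4` transposes a 4x4 block with two rounds of GSBUTTERFLY interleaves before storing, so each packed group holds one column of A. A scalar model of those two rounds, assuming GSBUTTERFLY with `v, w` operands expands to the `vilvl.w`/`vilvh.w` pair (the `.L_N2_M2` path uses `vilvl.w` directly); function and variable names are illustrative:

/* One GSBUTTERFLY step: interleave two 4-float vectors into low/high halves. */
static void ilv(const float *a, const float *b, float *lo, float *hi)
{
    lo[0] = a[0]; lo[1] = b[0]; lo[2] = a[1]; lo[3] = b[1];
    hi[0] = a[2]; hi[1] = b[2]; hi[2] = a[3]; hi[3] = b[3];
}

/* Two interleave rounds give a 4x4 transpose: out holds columns 0..3 of
 * the four input rows, matching the $vr0..$vr3 stores in .L_N4_M4. */
static void transpose4x4(const float r0[4], const float r1[4],
                         const float r2[4], const float r3[4],
                         float out[16])
{
    float t0[4], t1[4], t2[4], t3[4];
    ilv(r0, r2, t0, t1);              /* GSBUTTERFLY $vr4,$vr5, $vr2,$vr0 */
    ilv(r1, r3, t2, t3);              /* GSBUTTERFLY $vr6,$vr7, $vr3,$vr1 */
    ilv(t0, t2, out + 0, out + 4);    /* columns 0 and 1 */
    ilv(t1, t3, out + 8, out + 12);   /* columns 2 and 3 */
}

The 8-wide paths do the same thing one size up through GTRANSPOSE8x8_W, passing four of the D registers as scratch ("As tmp" in the source).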
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 +#define D10 $xr17 +#define D12 $xr18 +#define D14 $xr19 + +// Loops outline +//.L_N8: <---------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0:-------------- +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 17, 20 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x03 + beq J, ZERO, .L_N7 +.align 5 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 
0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N8 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 17, 20 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..d9789bdcd --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S @@ -0,0 +1,526 @@ 
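Reviewer note: unlike the ncopy files above, sgemm_tcopy_16_lasx.S copies rows of A without transposing; its PROLOGUE partitions the destination into one region per panel width (P0/P1 for the 16-wide panels, P2/P3/P4/P5 for the 8/4/2/1-wide column remainders). A C reference for that layout, inferred from the pointer arithmetic below; `sgemm_tcopy_16_ref` and its argument names are illustrative, not part of the patch:

#include <string.h>

/* Inferred reference layout for the tcopy: all full 16-column panels
 * come first (each panel = m rows of 16 contiguous floats, panel-major),
 * followed by the 8-, 4-, 2- and 1-column remainders packed row by row. */
static void sgemm_tcopy_16_ref(int m, int n, const float *a, int lda,
                               float *b)
{
    float *p8 = b + (size_t)m * (n & ~15);   /* P2 in the assembly */
    float *p4 = b + (size_t)m * (n & ~7);    /* P3 */
    float *p2 = b + (size_t)m * (n & ~3);    /* P4 */
    float *p1 = b + (size_t)m * (n & ~1);    /* P5 */

    for (int i = 0; i < m; i++) {
        const float *s = a + (size_t)i * lda;
        int j = 0;
        for (; j + 16 <= n; j += 16)         /* 16-wide panels */
            memcpy(b + (size_t)(j / 16) * m * 16 + (size_t)i * 16,
                   s + j, 16 * sizeof(float));
        if (j + 8 <= n) { memcpy(p8, s + j, 8 * sizeof(float)); p8 += 8; j += 8; }
        if (j + 4 <= n) { memcpy(p4, s + j, 4 * sizeof(float)); p4 += 4; j += 4; }
        if (j + 2 <= n) { memcpy(p2, s + j, 2 * sizeof(float)); p2 += 2; j += 2; }
        if (j < n)      *p1++ = s[j];
    }
}

The assembly walks eight source rows at a time (S1..S8) so one pass fills eight rows of every region while the loads are in registers, but the resulting byte layout matches this row-at-a-time reference.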
+/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N16: | +//| .L_N15: | +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N16: +//| .L_M4_N15: +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N16: +//| .L_M2_N15: +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N16: +//| .L_M1_N15: +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T0 + PTR_MUL P3, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_SLLI P3, P3, 0x02 + PTR_ADD P2, DST, P2 + 
PTR_ADD P3, DST, P3 + + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P4, M, T0 + PTR_MUL P5, M, T1 + PTR_SLLI P4, P4, 0x02 + PTR_SLLI P5, P5, 0x02 + PTR_ADD P4, DST, P4 + PTR_ADD P5, DST, P5 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x06 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x200 + + PTR_SRAI I, N, 0x04 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N15 +.L_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI S5, S5, 0x40 + PTR_ADDI S6, S6, 0x40 + PTR_ADDI S7, S7, 0x40 + PTR_ADDI S8, S8, 0x40 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N16 +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ + U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI P2, P2, 0x100 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ + $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P3, P3, 0x80 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ + $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P4, P4, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ + $f4, P5, 0x10, $f5, P5, 0x14, 
$f6, P5, 0x18, $f7, P5, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P5, P5, 0x20 +.L_N0: + blt ZERO, J, .L_M8 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M4_N15 +.align 5 +.L_M4_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N16 +.L_M4_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M4_N7 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI P2, P2, 0x80 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P3, P3, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P4, P4, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P5, P5, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M2_N15 +.align 5 +.L_M2_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N16 +.L_M2_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M2_N7 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI P2, P2, 0x40 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P3, P3, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P4, P4, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + GLD 
f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P5, P5, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M1_N15 +.align 5 +.L_M1_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N16 +.L_M1_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M1_N7 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P2, 0x00 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI P2, P2, 0x20 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P3, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P3, P3, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P4, P4, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P5, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P5, P5, 0x04 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..725a47a60 --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S @@ -0,0 +1,406 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
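Both tcopy kernels in this patch carve the packed buffer into fixed regions before any copying starts: full-width row panels go to the front, and each narrower leftover width gets its own base pointer further in, which is what the P2..P5 arithmetic in the PROLOGUE computes. A sketch of the pointer setup mirrored from the tcopy_16 prologue (names hypothetical; the PTR_SLLI 0x02 byte scaling is implicit in float pointer arithmetic here):

void tcopy16_dst_bases(long m, long n, float *dst,
                       float **p16, float **p8, float **p4,
                       float **p2, float **p1) {
    *p16 = dst;                        /* 16-wide panels   (P0/P1) */
    *p8  = dst + m * (n & ~15L);       /* 8-wide leftovers (P2)    */
    *p4  = dst + m * (n & ~7L);        /* 4-wide leftovers (P3)    */
    *p2  = dst + m * (n & ~3L);        /* 2-wide leftovers (P4)    */
    *p1  = dst + m * (n & ~1L);        /* single columns   (P5)    */
}

The tcopy_8 kernel below uses the same scheme one level down, with regions for widths 8, 4, 2 and 1.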
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define T0 $r27 +#define T1 $r28 +#define TL $r7 +#undef ZERO +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 23, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_ADD P2, DST, P2 + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P3, M, T0 + PTR_MUL P4, M, T1 + PTR_SLLI P3, P3, 0x02 + PTR_SLLI P4, P4, 0x02 + PTR_ADD P3, DST, P3 + PTR_ADD P4, DST, P4 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x05 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x03 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ + U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N8 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ + $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P2, P2, 0x80 +.L_N3: + 
andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ + $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P3, P3, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ + $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P4, P4, 0x20 +.L_N0: + blt ZERO, J, .L_M8 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M4_N7 +.align 5 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N8 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P2, P2, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P3, P3, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P4, P4, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M2_N7 +.align 5 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N8 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P2, P2, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P3, P3, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + 
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P4, P4, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x20 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M1_N7 +.align 5 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P1, 0x00 + + PTR_ADDI S1, S1, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N8 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P2, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P2, P2, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P3, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P3, P3, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P4, P4, 0x04 +.L_M0: + pop_if_used 23, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S index 57c21a017..8c5c91ade 100644 --- a/kernel/loongarch64/snrm2.S +++ b/kernel/loongarch64/snrm2.S @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L15 diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S index 49f640268..8e2165ab7 100644 --- a/kernel/loongarch64/znrm2.S +++ b/kernel/loongarch64/znrm2.S @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 move XX, X MOV s2, s1 srai.d I, N, 2 diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/mips/nrm2.c +++ b/kernel/mips/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c index 85be39cd1..d11a6bd4a 100644 --- a/kernel/mips/znrm2.c +++ b/kernel/mips/znrm2.c @@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S index 76fa9c295..159f9bea9 100644 --- a/kernel/mips64/cnrm2.S +++ b/kernel/mips64/cnrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT dsra I, N, 2 diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index cd40414a2..1b55d9fc3 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -81,7 +81,7 @@ blez N, .L999 MTC $0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT move XX, X diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S index 1ba061a7d..f18151b5c 100644 --- a/kernel/mips64/snrm2.S +++ b/kernel/mips64/snrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S index 1c247bca9..d33284a47 100644 --- a/kernel/mips64/znrm2.S +++ b/kernel/mips64/znrm2.S @@ -80,7 +80,7 @@ blez N, .L999 MTC 
$0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT move XX, X diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S index c115650fd..74117a831 100644 --- a/kernel/power/cnrm2.S +++ b/kernel/power/cnrm2.S @@ -99,7 +99,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S index 46c29c654..0d036b32f 100644 --- a/kernel/power/cnrm2_hummer.S +++ b/kernel/power/cnrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) andi. r0, X, 2 * SIZE - 1 bne LL(100) diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S index c71c34b7c..8e3abf9f9 100644 --- a/kernel/power/cnrm2_ppc440.S +++ b/kernel/power/cnrm2_ppc440.S @@ -104,7 +104,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 sub X, X, INCX diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c index ecdc3e5c6..73f6d5b99 100644 --- a/kernel/power/dgemm_small_kernel_nn_power10.c +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c index 93a942b02..426948185 100644 --- a/kernel/power/dgemm_small_kernel_tn_power10.c +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S index 4931f5ab1..8638ca424 100644 --- a/kernel/power/dnrm2_hummer.S +++ b/kernel/power/dnrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S index 849ca1f35..529f6adf0 100644 --- a/kernel/power/dnrm2_ppc440.S +++ b/kernel/power/dnrm2_ppc440.S @@ -111,7 +111,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X diff --git a/kernel/power/lock.c b/kernel/power/lock.c index 1c1b006b0..de28680d8 100644 --- a/kernel/power/lock.c +++ b/kernel/power/lock.c @@ -36,7 +36,7 @@ /* or implied, of The University of Texas at Austin. 
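The nrm2 hunks across loongarch64, mips, mips64 and power all make the same semantic change: a negative increment no longer takes the early exit (blez/bge/ble becomes beqz/beq, and inc_x <= 0 becomes inc_x == 0 in the C variants), since the 2-norm does not depend on traversal order; only a zero increment still returns immediately. A scalar sketch of the corrected guard, assuming the caller positions x so that striding by |inc_x| stays in bounds, and with the overflow-safe scaling of the real kernels omitted for brevity:

#include <math.h>

float snrm2_guard_ref(long n, const float *x, long inc_x) {
    if (n <= 0 || inc_x == 0) return 0.0f;   /* was: inc_x <= 0 */
    if (inc_x < 0) inc_x = -inc_x;           /* order never changes the norm */
    double sum = 0.0;
    for (long i = 0; i < n; i++)
        sum += (double)x[i * inc_x] * x[i * inc_x];
    return (float)sqrt(sum);
}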
*/ /*********************************************************************/ -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ #ifdef __GNUC__ diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S index d9e1f4e9a..880b5d1b4 100644 --- a/kernel/power/nrm2.S +++ b/kernel/power/nrm2.S @@ -113,7 +113,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 134929ec1..c3fa67cf6 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -336,7 +336,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (m & 1) { IFLOAT *BO = B; - v2sf_t *rowC; v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); @@ -492,7 +491,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); - vec_t *rowB = (vec_t *) & (BO[l]); vector short rowB_mrg = { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); @@ -570,7 +568,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (m & 1) { IFLOAT *BO = B; - v2sf_t *rowC; v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S index be974cc48..696d404bb 100644 --- a/kernel/power/snrm2.S +++ b/kernel/power/snrm2.S @@ -97,7 +97,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S index a0ff3d1b2..a4292af78 100644 --- a/kernel/power/snrm2_hummer.S +++ b/kernel/power/snrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) cmpwi cr0, INCX, SIZE bne LL(100) diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S index 0a80d1224..3547d7f47 100644 --- a/kernel/power/snrm2_ppc440.S +++ b/kernel/power/snrm2_ppc440.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S index 60f379d25..3048e3480 100644 --- a/kernel/power/znrm2.S +++ b/kernel/power/znrm2.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S index 1d0c598f8..4ef2212df 100644 --- a/kernel/power/znrm2_hummer.S +++ b/kernel/power/znrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S index 778b805de..f775c3e62 100644 --- a/kernel/power/znrm2_ppc440.S +++ b/kernel/power/znrm2_ppc440.S @@ -112,7 +112,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 0da66fa35..e6f2b3314 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy_vector.c -ZAXPYKERNEL = zaxpy_vector.c +CAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c 
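The dgemm_small_kernel_*_power10.c guards above widen the compiler check: the legacy LOAD_PAIR path is now selected not only for GCC 10 but also for GCC 11 up to 11.2, presumably because those releases still behave like GCC 10 for the pair-assembly builtins. The widened condition, isolated into a helper for illustration (name hypothetical):

/* Nonzero on compilers that need the legacy pair-builtin path. */
static inline int needs_legacy_pair_builtins(void) {
#if defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))
    return 1;
#else
    return 0;
#endif
}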
@@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b778d3e55..1b7799340 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index fd2f83dc9..f9b7defae 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index a82275153..fc73362bc 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
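From here on, most of the riscv64 changes are one mechanical rename: the RVV load/store intrinsics gain the element width in their names (vle_v_f32m8 becomes vle32_v_f32m8, vlse_v_f64m8 becomes vlse64_v_f64m8, and likewise for the vse/vsse stores), matching the width-qualified naming of current rvv-intrinsic releases. The VLEV_FLOAT/VLSEV_FLOAT macros keep their old spellings, so the kernel bodies are untouched. For illustration, with the signatures these files now target:

#include <stddef.h>
#include <riscv_vector.h>

vfloat32m8_t load_unit(const float *p, size_t vl) {
    return vle32_v_f32m8(p, vl);                /* was vle_v_f32m8  */
}
vfloat32m8_t load_strided(const float *p, ptrdiff_t bstride, size_t vl) {
    return vlse32_v_f32m8(p, bstride, vl);      /* was vlse_v_f32m8 */
}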
#define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 988c57ec2..676dfd474 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 98b9f6814..6f921f2d6 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -30,18 +30,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index a46136d6c..fee5e195d 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index 64efc6c40..cc27d68ed 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -45,8 +45,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 @@ -63,7 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG i=0, j=0; double dot = 0.0 ; - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); FLOAT_V_T vr, vx, vy; unsigned int gvl = 0; diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 32ca8618b..bb9ab8e5a 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7683641fa..7d0b70cbb 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
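One behavioral tweak hides among the renames: dot_vector.c now bails out for n < 1 instead of n < 0, so an empty vector returns 0.0 before any vector-length or stride setup runs. Reference semantics of the tightened guard:

double dot_ref(long n, const float *x, long inc_x,
               const float *y, long inc_y) {
    double dot = 0.0;
    if (n < 1) return dot;                      /* was: n < 0 */
    for (long i = 0; i < n; i++)
        dot += (double)x[i * inc_x] * (double)y[i * inc_y];
    return dot;
}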
#define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index ecb4cd7a9..4242af6ea 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -29,14 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(DOUBLE) - -#define ABS fabs +#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -54,14 +53,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VADDVX_UINT vadd_vx_u64m8 #define VMVVX_UINT vmv_v_x_u64m8 #else - -#define ABS fabsf +#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -85,7 +83,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; FLOAT maxf=0.0; +#ifdef DOUBLE + BLASLONG max_index = 0; +#else unsigned int max_index = 0; +#endif if (n <= 0 || inc_x <= 0) return(max_index); FLOAT_V_T vx, v_max; @@ -117,11 +119,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; } v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + maxf = VFMVFS_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); - +#ifdef DOUBLE + max_index = *((BLASLONG *)&v_max_index+max_index); +#else + max_index = *((unsigned int *)&v_max_index+max_index); +#endif if(j < n){ gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); @@ -130,7 +135,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + FLOAT cur_maxf = VFMVFS_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -138,7 +143,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); max_index = VMFIRSTM(mask,gvl); +#ifdef DOUBLE + max_index = *((BLASLONG*)&v_max_index+max_index); +#else max_index = *((unsigned 
int*)&v_max_index+max_index); +#endif } } }else{ @@ -165,11 +174,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) idx += inc_v; } v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + maxf = VFMVFS_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); +#ifdef DOUBLE + max_index = *((BLASLONG*)&v_max_index+max_index); +#else max_index = *((unsigned int*)&v_max_index+max_index); - +#endif if(j < n){ gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); @@ -178,7 +190,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + FLOAT cur_maxf = VFMVFS_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -186,11 +198,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); max_index = VMFIRSTM(mask,gvl); +#ifdef DOUBLE + max_index = *((BLASLONG*)&v_max_index+max_index); +#else max_index = *((unsigned int*)&v_max_index+max_index); +#endif } } } return(max_index+1); } - - diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index c72bb94cc..4e81e7848 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index c2d787ab8..ca48a3c48 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
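The iamax_vector.c hunks above fix two latent problems at once. First, the reduction result is now read with vfmv_f_s_* instead of casting the address of a vector variable; RVV vector types are sizeless, so *((FLOAT*)&v_res) was never valid C. Second, under DOUBLE the index vector holds 64-bit lanes, so the winning index must be read back as BLASLONG rather than unsigned int. The extraction, in isolation:

#include <riscv_vector.h>

/* Moves lane 0 of the reduced vector into a scalar; replaces the
   invalid *(FLOAT *)&v_res pointer-punning used previously. */
static float reduction_to_scalar(vfloat32m1_t v_res) {
    return vfmv_f_s_f32m1_f32(v_res);
}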
#define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index dfe9a3310..2a677098d 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index fdbdc3ae8..66a101566 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 59c720310..818193a9e 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index b988513c9..7f31e9a53 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index be0803df6..14b7e01ed 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/riscv64/nrm2.c +++ b/kernel/riscv64/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 2a83e2a52..cf6fdb741 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -55,8 +55,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index 9b48d1c69..f3786e1d0 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFMSACVF_FLOAT vfmsac_vf_f32m4 @@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFMSACVF_FLOAT vfmsac_vf_f64m4 @@ -155,6 +155,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } }else{ gvl = VSETVL(n); + if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG stride_y = inc_y * sizeof(FLOAT); BLASLONG inc_xv = inc_x * gvl; diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 7a3153b7c..8b9ef5a3e 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #define VFMULVF_FLOAT vfmul_vf_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #define VFMULVF_FLOAT vfmul_vf_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #endif diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index d9421e2f1..82fa5ce31 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -136,6 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, } }else{ gvl = VSETVL(n); + if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); stride_x = inc_x * sizeof(FLOAT); stride_y = inc_y * sizeof(FLOAT); if(gvl <= n/2){ diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 6588f4dda..58ec17b03 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,10 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 31104eae6..34ff0e30a 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
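rot_vector.c above and swap_vector.c below gain the same defensive line: when both increments are zero, every logical element aliases the single scalar at x[0]/y[0], and a full-width vector update would collapse n read-modify-write steps into one. Requesting a vector length of 1 keeps the updates sequential. The idea, isolated as a sketch:

#include <riscv_vector.h>

/* Grant only one element per iteration when both operands are
   zero-stride, so aliased updates are applied n times, not once. */
static size_t granted_vl(size_t n, long inc_x, long inc_y) {
    return (inc_x == 0 && inc_y == 0) ? vsetvl_e32m8(1)
                                      : vsetvl_e32m8(n);
}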
#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 9dbeba90f..bfb282ae0 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index dc58075ac..d9eca7f10 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 8386ab62e..0d1cc42f1 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c
index 8386ab62e..0d1cc42f1 100644
--- a/kernel/riscv64/zasum_vector.c
+++ b/kernel/riscv64/zasum_vector.c
@@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
index 3eca20415..5e6034ac5 100644
--- a/kernel/riscv64/zaxpby_vector.c
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index 303d3541e..4ccfe4a81 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
 #endif
diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c
index 600f02bba..55a480a35 100644
--- a/kernel/riscv64/zcopy_vector.c
+++ b/kernel/riscv64/zcopy_vector.c
@@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #endif
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
index ec38ed9d2..0900206b3 100644
--- a/kernel/riscv64/zdot_vector.c
+++ b/kernel/riscv64/zdot_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c
index b5ee1f054..3095c28f9 100644
--- a/kernel/riscv64/zgemv_n_vector.c
+++ b/kernel/riscv64/zgemv_n_vector.c
@@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
 #endif
diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c
index e930dc2a2..a7a8a5279 100644
--- a/kernel/riscv64/zgemv_t_vector.c
+++ b/kernel/riscv64/zgemv_t_vector.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c
index 275ee9131..0a284a999 100644
--- a/kernel/riscv64/zhemv_LM_vector.c
+++ b/kernel/riscv64/zhemv_LM_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
index 2f46977d4..33b7c9c25 100644
--- a/kernel/riscv64/zhemv_UV_vector.c
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c
index fc1c8b54a..28bb0eda5 100644
--- a/kernel/riscv64/znrm2.c
+++ b/kernel/riscv64/znrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 
 	inc_x2 = 2 * inc_x;
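[Note on the znrm2.c guard change just above: a negative increment is a legal BLAS stride (the vector is simply walked from the other end, and a sum of squares is order-independent), so only the degenerate zero stride is rejected now. A scalar reference sketch of the corrected guard, assuming the caller positions the pointer for negative strides the way the BLAS interface layer conventionally does; names are illustrative:

#include <math.h>

static double znrm2_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x == 0) return 0.0;        /* was: inc_x <= 0 */
    long inc_x2 = 2 * inc_x;                     /* complex: two reals per element */
    long ix = inc_x < 0 ? (n - 1) * (-inc_x2) : 0; /* start at the far end if negative */
    double ssq = 0.0;
    for (long i = 0; i < n; i++, ix += inc_x2)
        ssq += x[ix] * x[ix] + x[ix + 1] * x[ix + 1];
    return sqrt(ssq);
}]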
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index 59d0e219d..cadabdb75 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -53,8 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
index 2fdd8135a..727d13a87 100644
--- a/kernel/riscv64/zrot_vector.c
+++ b/kernel/riscv64/zrot_vector.c
@@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
@@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
@@ -112,6 +112,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 
         }
     }else{
+        if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
         for(i=0,j=0; i < n/gvl; i++){
             vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
             vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 64323aa3a..d275b75f8 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
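[Note on the gvl clamps being added to the strided paths (zrot above, zswap below): when inc_x == 0 and inc_y == 0, every vector lane of a strided load or store aliases the same address, so the result of a wide strided store would depend on lane ordering. Clamping the granted vector length to 1 makes those degenerate calls behave like the scalar loop. A minimal sketch, under the same riscv_vector.h assumption as the earlier example:

#include <riscv_vector.h>
#include <stddef.h>

size_t pick_gvl(size_t n, long inc_x, long inc_y)
{
    size_t gvl = vsetvl_e32m4(n);
    if (inc_x == 0 && inc_y == 0)
        gvl = vsetvl_e32m4(1); /* all lanes alias one address: step one element at a time */
    return gvl;
}]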
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index 7550294b5..09cc8992a 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -81,6 +81,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
 
         }
     }else{
         gvl = VSETVL(n);
+        if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
         stride_x = inc_x * 2 * sizeof(FLOAT);
         stride_y = inc_y * 2 * sizeof(FLOAT);
         BLASLONG inc_xv = inc_x * gvl * 2;
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 14a339e75..4c361f155 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -49,7 +50,9 @@ static void init_parameter(void);
 
 gotoblas_t TABLE_NAME = {
-  DTB_DEFAULT_ENTRIES ,
+  DTB_DEFAULT_ENTRIES,
+
+  SWITCH_RATIO,
 
   GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
@@ -63,6 +66,7 @@ gotoblas_t TABLE_NAME = {
 #endif
 
   SBGEMM_ALIGN_K,
+  0, // need_amxtile_permission
 
   sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
@@ -1806,6 +1810,12 @@ static void init_parameter(void) {
 #endif
 
+#ifdef SAPPHIRERAPIDS
+#if (BUILD_BFLOAT16 == 1)
+  TABLE_NAME.need_amxtile_permission = 1;
+#endif
+#endif
+
 #if BUILD_COMPLEX==1
 #ifdef CGEMM3M_DEFAULT_P
   TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S
index 8dc4b56b6..0840c8848 100644
--- a/kernel/sparc/cnrm2.S
+++ b/kernel/sparc/cnrm2.S
@@ -76,7 +76,7 @@
 	FMOV	c1, t4
 
 	cmp	INCX, 0
-	ble	.LL20
+	beq	.LL20
 	sll	INCX, ZBASE_SHIFT, INCX
 
 	cmp	N, 0
diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S
index cf7522953..41e993440 100644
--- a/kernel/sparc/dnrm2.S
+++ b/kernel/sparc/dnrm2.S
@@ -107,7 +107,7 @@
 	FMOV	fzero, c1
 
 	cmp	INCX, 0
-	ble	.LL99
+	beq	.LL99
 	sll	INCX, BASE_SHIFT, INCX
 
 	add	%sp, -8, %sp
diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S
index a80247259..a7405b6e1 100644
--- a/kernel/sparc/snrm2.S
+++ b/kernel/sparc/snrm2.S
@@ -76,7 +76,7 @@
 	FMOV	c1, t4
 
 	cmp	INCX, 0
-	ble	.LL20
+	beq	.LL20
 	sll	INCX, BASE_SHIFT, INCX
 
 	cmp	N, 0
diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S
index 065d22784..dae53ffe7 100644
--- a/kernel/sparc/znrm2.S
+++ b/kernel/sparc/znrm2.S
@@ -107,7 +107,7 @@
 	FMOV	fzero, c1
 
 	cmp	INCX, 0
-	ble	.LL99
+	beq	.LL99
 	sll	INCX, ZBASE_SHIFT, INCX
 
 	add	%sp, -8, %sp
diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S
index 7a14da862..3a6417462 100644
--- a/kernel/x86/nrm2.S
+++ b/kernel/x86/nrm2.S
@@ -78,7 +78,7 @@
 	testl	M, M
 	jle	.L999
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 	sall	$BASE_SHIFT, INCX
 	fldz
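[Note on the setparam-ref.c hunk above: the dispatch table gains a SWITCH_RATIO slot and a need_amxtile_permission flag, which init_parameter() raises on SAPPHIRERAPIDS when BFLOAT16 is built. On Linux, using AMX tile registers requires a one-time per-process permission request before the first tile instruction; a hedged sketch of what such a flag typically gates (Linux-specific, constants from the kernel uapi, error handling illustrative):

#include <unistd.h>
#include <sys/syscall.h>

#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILEDATA  18

static int request_amx_permission(void)
{
    /* ask the kernel to allow XTILEDATA state for this process */
    return syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
}

The sparc and x86 nrm2 hunks surrounding it all make the same semantic change as the znrm2.c guard earlier: branch to the early exit only on a zero increment (beq/je), not on any non-positive one (ble/jle).]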
diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S
index 0f174c408..129b41a03 100644
--- a/kernel/x86/nrm2_sse.S
+++ b/kernel/x86/nrm2_sse.S
@@ -69,7 +69,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 	leal	(, INCX, SIZE), INCX
 	cmpl	$SIZE, INCX
diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S
index 263612e9a..7a65df77a 100644
--- a/kernel/x86/znrm2.S
+++ b/kernel/x86/znrm2.S
@@ -78,7 +78,7 @@
 	testl	M, M
 	jle	.L999
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 	sall	$ZBASE_SHIFT, INCX
 	fldz
diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S
index bbc3677ae..4ad326120 100644
--- a/kernel/x86/znrm2_sse.S
+++ b/kernel/x86/znrm2_sse.S
@@ -69,7 +69,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 	sall	$ZBASE_SHIFT, INCX
diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
index 88f574668..3a832e917 100644
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
@@ -1,6 +1,14 @@
 include $(KERNELDIR)/KERNEL.COOPERLAKE
 
-SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c
+SBGEMM_SMALL_M_PERMIT =
+SBGEMM_SMALL_K_NN =
+SBGEMM_SMALL_K_B0_NN =
+SBGEMM_SMALL_K_NT =
+SBGEMM_SMALL_K_B0_NT =
+SBGEMM_SMALL_K_TN =
+SBGEMM_SMALL_K_B0_TN =
+SBGEMM_SMALL_K_TT =
+SBGEMM_SMALL_K_B0_TT =
 
 SBGEMM_BETA = sgemm_beta_skylakex.c
 SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c
diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index b398aa6e1..d261962de 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -1,5 +1,10 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 
 #define HAVE_CASUM_KERNEL 1
@@ -347,3 +352,4 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
     return sumf;
 }
 #endif
+#endif
diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
index 3ca173c20..94e6d2c77 100644
--- a/kernel/x86_64/cgemv_n_4.c
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -24,9 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
 *****************************************************************************/
+#if (defined(__GNUC__) && __GNUC__ > 11)
+#pragma GCC optimize("no-tree-vectorize")
+#endif
 
-#include <stdlib.h>
-#include <stdio.h>
 #include "common.h"
 
 #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
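[Note on the guard pattern that this and the following microkernel hunks introduce: NVHPC (nvc) defines __NVCOMPILER with separate major/minor version macros, so the patch folds them into one comparable number before extending each compiler gate. The pattern in isolation, with the AVX2 variant of the gate as an example:

#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#endif

#if ((defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || \
     (defined(__clang__) && __clang_major__ >= 6)) || \
    (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)
/* vectorized kernel body goes here */
#endif

2203 encodes release 22.3; the casum/zasum gates require 2309 and additionally exclude clang 17, where these kernels apparently misbehave.]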
diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c
index 3187e196c..f123e81d1 100644
--- a/kernel/x86_64/cgemv_t_4.c
+++ b/kernel/x86_64/cgemv_t_4.c
@@ -24,7 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
 *****************************************************************************/
-
+#if (defined(__GNUC__) && __GNUC__ > 11)
+#pragma GCC optimize("no-tree-vectorize")
+#endif
 
 #include "common.h"
diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c
index 8a622427b..a6c012a4c 100644
--- a/kernel/x86_64/cscal_microk_skylakex-2.c
+++ b/kernel/x86_64/cscal_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c
index fd9da7ebe..bc27c7647 100644
--- a/kernel/x86_64/dasum_microk_haswell-2.c
+++ b/kernel/x86_64/dasum_microk_haswell-2.c
@@ -1,4 +1,7 @@
-#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DASUM_KERNEL
diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c
index 83bc078b3..76b9fbef0 100644
--- a/kernel/x86_64/dasum_microk_skylakex-2.c
+++ b/kernel/x86_64/dasum_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DASUM_KERNEL 1
diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c
index e785a39f1..5b9147d10 100644
--- a/kernel/x86_64/daxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/daxpy_microk_skylakex-2.c
@@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c
index 8eabf225a..f076862f7 100644
--- a/kernel/x86_64/ddot_microk_skylakex-2.c
+++ b/kernel/x86_64/ddot_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_8 1
diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
index a98772b94..da57a18a7 100644
--- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
@@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
index 37d1ca497..69ad6d94e 100644
--- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
+++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
@@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
index 4030399ab..4e8739864 100644
--- a/kernel/x86_64/dgemv_n_microk_skylakex-4.c
+++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_4x4 1
diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c
index 4e862e663..bf9c044d4 100644
--- a/kernel/x86_64/drot_microk_skylakex-2.c
+++ b/kernel/x86_64/drot_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DROT_KERNEL 1
diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c
index e0598272e..381136414 100644
--- a/kernel/x86_64/dscal_microk_skylakex-2.c
+++ b/kernel/x86_64/dscal_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
index f0df5aaa8..ca4773a4b 100644
--- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c
+++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
@@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
diff --git a/kernel/x86_64/dtobf16_microk_cooperlake.c b/kernel/x86_64/dtobf16_microk_cooperlake.c
index 9b8ac4714..b713b39be 100644
--- a/kernel/x86_64/dtobf16_microk_cooperlake.c
+++ b/kernel/x86_64/dtobf16_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_TOBF16_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S
index b79ac2adb..61cf8c452 100644
--- a/kernel/x86_64/nrm2.S
+++ b/kernel/x86_64/nrm2.S
@@ -58,7 +58,7 @@
 	testq	M, M
 	jle	.L999
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 	salq	$BASE_SHIFT, INCX
diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S
index 33b1ee496..c1f3a45fc 100644
--- a/kernel/x86_64/nrm2_sse.S
+++ b/kernel/x86_64/nrm2_sse.S
@@ -57,7 +57,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 	pxor	%xmm2, %xmm2
 	leaq	(, INCX, SIZE), INCX
diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c
index 2eb5b9538..3b4d65cfc 100644
--- a/kernel/x86_64/sasum_microk_haswell-2.c
+++ b/kernel/x86_64/sasum_microk_haswell-2.c
@@ -1,4 +1,7 @@
-#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SASUM_KERNEL 1
diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c
index fbc91b558..f193053ee 100644
--- a/kernel/x86_64/sasum_microk_skylakex-2.c
+++ b/kernel/x86_64/sasum_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SASUM_KERNEL 1
diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c
index 950f10ba2..bbe4d2bc5 100644
--- a/kernel/x86_64/saxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/saxpy_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_16 1
diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c
index 2aefe46ff..ccec98e34 100644
--- a/kernel/x86_64/sbdot_microk_cooperlake.c
+++ b/kernel/x86_64/sbdot_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBDOT_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
index 90e0a32c7..5ee3c8532 100644
--- a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
+++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
@@ -97,33 +97,32 @@ typedef struct {
 #define T_C10 6
 #define T_C11 7
 
-// FIXME: gcc11 seem have problem in tile load/store address calc,
-// need to multiply with element size (2 or 4) here.
+
 #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2)
 #define LOAD_A_TAIL(M, N) {\
 	__m256i ymm = _mm256_loadu_epi16(ptr_a##M); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
-	_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
+	_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
 }
 #define MASK_LOAD_A_TAIL(M, N) {\
 	__m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
-	_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
+	_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
 }
 #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2)
 #define LOAD_B_TAIL(M, N) {\
 	__m256i ymm = _mm256_loadu_epi16(ptr_b##N); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
-	_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
+	_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
 }
 #define MASK_LOAD_B_TAIL(M, N) {\
 	__m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
-	_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
+	_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
 }
 #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N)
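[Note on the sbgemm tile hunk above: the tail path stages 16 bf16 values in a scratch buffer, widening each into its own 32-bit slot, and the stale FIXME had left the subsequent _tile_loadd reading at twice the offset the store wrote (16 * 2 * M instead of 16 * M). The fix makes load and store agree. A self-contained sketch of the corrected pattern, assuming AVX512BW/VL and AMX support (-mamx-tile); the function, tile index 0, and buffer type are illustrative:

#include <immintrin.h>

static void load_a_tail(const unsigned short *src, int M, int *tail_a)
{
    __m256i ymm = _mm256_loadu_epi16(src);      /* 16 bf16 values             */
    __m512i zmm = _mm512_cvtepu16_epi32(ymm);   /* one value per 32-bit slot  */
    _mm512_storeu_epi16(tail_a + 16 * M, zmm);  /* staged at slot 16*M        */
    _tile_loadd(0, tail_a + 16 * M, 2 * 2);     /* read back the same offset,
                                                   row stride 4 bytes         */
}]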
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
index d875e0d96..c87f9fa5b 100644
--- a/kernel/x86_64/sbgemv_n_microk_cooperlake.c
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBGEMV_N_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
index 23da2e809..5b7a2e147 100644
--- a/kernel/x86_64/sbgemv_t_microk_cooperlake.c
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBGEMV_T_ACCL_KERNEL 1
diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c
index 1fcb7f27c..f14632f94 100644
--- a/kernel/x86_64/sdot_microk_skylakex-2.c
+++ b/kernel/x86_64/sdot_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_16 1
diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
index 2366fe3aa..6f4309c30 100644
--- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
@@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
index 308f5e35e..987b090ba 100644
--- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
@@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index c9681fa8b..296eded5a 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#if (defined(__GNUC__) && __GNUC__ > 11)
 #pragma GCC optimize("no-tree-vectorize")
 #endif
diff --git a/kernel/x86_64/sgemv_n_microk_skylakex-8.c b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
index fba9cedcd..199621712 100644
--- a/kernel/x86_64/sgemv_n_microk_skylakex-8.c
+++ b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 #define HAVE_SGEMV_N_SKYLAKE_KERNEL 1
 #include "common.h"
@@ -255,4 +259,4 @@ static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLAS
 }
 
-#endif
\ No newline at end of file
+#endif
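[Note on the sgemv/ssymv/zdot/zgemv changes here and below: the #pragma GCC optimize workaround was previously limited to OS_DARWIN/OS_WINDOWS; dropping that qualifier applies it on every platform where GCC newer than 11 reportedly mis-vectorizes these hand-written loops. The (now platform-independent) guard in isolation:

#if (defined(__GNUC__) && __GNUC__ > 11)
#pragma GCC optimize("no-tree-vectorize")   /* disable tree vectorization for this unit */
#endif

Note also that the fused-version macro in the sgemv_n_microk_skylakex-8.c gate is spelled NVCOMPVERS here, matching the #define; a stray __NVCOMPVERS spelling would silently evaluate to 0 and disable the NVHPC branch.]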
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index 07aa51503..ea89a2aaf 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#if (defined(__GNUC__) && __GNUC__ > 11)
 #pragma GCC optimize("no-tree-vectorize")
 #endif
diff --git a/kernel/x86_64/sgemv_t_microk_skylakex.c b/kernel/x86_64/sgemv_t_microk_skylakex.c
index dca12acfc..d4f675a1e 100644
--- a/kernel/x86_64/sgemv_t_microk_skylakex.c
+++ b/kernel/x86_64/sgemv_t_microk_skylakex.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SGEMV_T_SKYLAKE_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c
index a21d1cf64..aec25ac56 100644
--- a/kernel/x86_64/srot_microk_skylakex-2.c
+++ b/kernel/x86_64/srot_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SROT_KERNEL 1
diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c
index c4fa160f0..5c13cba55 100644
--- a/kernel/x86_64/sscal_microk_skylakex-2.c
+++ b/kernel/x86_64/sscal_microk_skylakex-2.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 #include <immintrin.h>
diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c
index 45914daf5..4826b00c6 100644
--- a/kernel/x86_64/ssymv_L.c
+++ b/kernel/x86_64/ssymv_L.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#if (defined(__GNUC__) && __GNUC__ > 11)
 #pragma GCC optimize("no-tree-vectorize")
 #endif
diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c
index 26e5ca7e9..06db14ebe 100644
--- a/kernel/x86_64/ssymv_U.c
+++ b/kernel/x86_64/ssymv_U.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#if (defined(__GNUC__) && __GNUC__ > 11)
 #pragma GCC optimize("no-tree-vectorize")
 #endif
diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c
index 2756a6934..e7d20ddfa 100644
--- a/kernel/x86_64/stobf16_microk_cooperlake.c
+++ b/kernel/x86_64/stobf16_microk_cooperlake.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 #define HAVE_TOBF16_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index e257a5456..dddf03fe2 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -1,5 +1,10 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 
 #define HAVE_ZASUM_KERNEL 1
@@ -338,3 +343,4 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
     return sumf;
 }
 #endif
+#endif
diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S
index a7dd054fb..3776c8910 100644
--- a/kernel/x86_64/zaxpy_sse2.S
+++ b/kernel/x86_64/zaxpy_sse2.S
@@ -1418,10 +1418,10 @@
 	movq	M, %rax
 //If incx==0 || incy==0, avoid unloop and jump to end.
 	cmpq	$0, INCX
-	je	.L58
+	jne	.L59
 	cmpq	$0, INCY
 	je	.L58
-
+.L59:
 	sarq	$3, %rax
 	jle	.L55
diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index 27397ccfa..72a712a9e 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#if (defined(__GNUC__) && __GNUC__ > 11)
 #pragma GCC optimize("no-tree-vectorize")
 #endif
diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c
index 8fc960610..678cea957 100644
--- a/kernel/x86_64/zgemv_n_4.c
+++ b/kernel/x86_64/zgemv_n_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#if (defined(__GNUC__) && __GNUC__ > 11)
 #pragma GCC optimize("no-tree-vectorize")
 #endif
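[Note on the zaxpy_sse2.S branch retarget above: previously the unrolled path was skipped whenever either increment was zero; with the new .L59 label it is skipped only when both are zero. The same condition as straight C, under the assumption that the scalar tail loop at .L58 handles the degenerate case correctly:

static int use_unrolled_path(long inc_x, long inc_y)
{
    /* old behaviour: inc_x != 0 && inc_y != 0 (either zero skipped unrolling) */
    return !(inc_x == 0 && inc_y == 0);   /* new: skip only if both are zero */
}]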
#include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index 0d2aa3480..748fde310 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -58,7 +58,7 @@ testq M, M jle .L999 testq INCX, INCX - jle .L999 + je .L999 salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S index f78b83f7e..2274f2e98 100644 --- a/kernel/x86_64/znrm2_sse.S +++ b/kernel/x86_64/znrm2_sse.S @@ -58,7 +58,7 @@ jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX - jle .L999 + je .L999 xorq FLAG, FLAG diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c index f9e05e333..29dc4f6df 100644 --- a/kernel/x86_64/zscal_microk_skylakex-2.c +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include diff --git a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake index 585ca26e7..17c0df80e 100644 --- a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake +++ b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake @@ -1,4 +1,4 @@ -# This module perdorms several try-compiles to determine the default integer +# This module performs several try-compiles to determine the default integer # size being used by the fortran compiler # # After execution, the following variables are set. If they are un set then diff --git a/lapack-netlib/CMAKE/Findcodecov.cmake b/lapack-netlib/CMAKE/Findcodecov.cmake index 384064007..93db45130 100644 --- a/lapack-netlib/CMAKE/Findcodecov.cmake +++ b/lapack-netlib/CMAKE/Findcodecov.cmake @@ -36,7 +36,7 @@ function(add_coverage TNAME) endfunction() -# Find the reuired flags foreach language. +# Find the required flags foreach language. set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET}) set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) @@ -118,7 +118,7 @@ function (codecov_path_of_source FILE RETURN_VAR) # If expression was found, SOURCEFILE is a generator-expression for an # object library. Currently we found no way to call this function automatic - # for the referenced target, so it must be called in the directoryso of the + # for the referenced target, so it must be called in the directory of the # object library definition. 
if(NOT "${_source}" STREQUAL "") set(${RETURN_VAR} "" PARENT_SCOPE) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index b704e72c5..fefaa8b89 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -1,16 +1,20 @@ -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.2) project(LAPACK Fortran C) set(LAPACK_MAJOR_VERSION 3) -set(LAPACK_MINOR_VERSION 9) +set(LAPACK_MINOR_VERSION 11) set(LAPACK_PATCH_VERSION 0) set( LAPACK_VERSION ${LAPACK_MAJOR_VERSION}.${LAPACK_MINOR_VERSION}.${LAPACK_PATCH_VERSION} ) -# Add the CMake directory for custon CMake modules +# Allow setting a prefix for the library names +set(CMAKE_STATIC_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}") +set(CMAKE_SHARED_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}") + +# Add the CMake directory for custom CMake modules set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH}) # Export all symbols on Windows when building shared libraries @@ -41,6 +45,40 @@ if(_is_coverage_build) find_package(codecov) endif() +# By default test Fortran compiler complex abs and complex division +option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF) +if( TEST_FORTRAN_COMPILER ) + + add_executable( test_zcomplexabs ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f ) + add_custom_target( run_test_zcomplexabs + COMMAND test_zcomplexabs 2> test_zcomplexabs.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexabs in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexabs.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f ) + + add_executable( test_zcomplexdiv ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f ) + add_custom_target( run_test_zcomplexdiv + COMMAND test_zcomplexdiv 2> test_zcomplexdiv.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexdiv in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexdiv.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f ) + + add_executable( test_zcomplexmult ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f ) + add_custom_target( run_test_zcomplexmult + COMMAND test_zcomplexmult 2> test_zcomplexmult.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexmult in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexmult.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f ) + + add_executable( test_zminMax ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f ) + add_custom_target( run_test_zminMax + COMMAND test_zminMax 2> test_zminMax.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f ) + +endif() + # By default static library option(BUILD_SHARED_LIBS "Build shared libraries" OFF) @@ -89,12 +127,57 @@ configure_file( include(PreventInSourceBuilds) include(PreventInBuildInstalls) +# Check if recursive flag exists +include(CheckFortranCompilerFlag) +if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang) + check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + check_fortran_compiler_flag("-frecursive" _frecursiveFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) + check_fortran_compiler_flag("-recursive" _recursiveFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL) + check_fortran_compiler_flag("-qrecur" _qrecurFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG) + 
check_fortran_compiler_flag("-recursive" _recursiveFlag) +else() + message(WARNING "Fortran local arrays should be allocated on the stack." + " Please use a compiler which guarantees that feature." + " See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.") +endif() + +# Add recursive flag +if(_MrecursiveFlag) + string(REGEX MATCH "-Mrecursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +elseif(_frecursiveFlag) + string(REGEX MATCH "-frecursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +elseif(_recursiveFlag) + string(REGEX MATCH "-recursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +elseif(_qrecurFlag) + string(REGEX MATCH "-qrecur" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +endif() + if(UNIX) if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") endif() if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict") endif() # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin @@ -128,6 +211,22 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq) endif() endif() +# Add option to enable flat namespace for symbol resolution on macOS +if(APPLE) + option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF) + + if(USE_FLAT_NAMESPACE) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-flat_namespace") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-flat_namespace") + else() + if(BUILD_SHARED_LIBS AND BUILD_TESTING) + message(WARNING + "LAPACK test suite might fail with shared libraries and the default two-level namespace. " + "Disable shared libraries or enable flat namespace for symbol resolution via -DUSE_FLAT_NAMESPACE=ON.") + endif() + endif() +endif() # -------------------------------------------------- set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets) @@ -149,13 +248,14 @@ option(BUILD_TESTING "Build tests" ${_is_coverage_build}) include(CTest) message(STATUS "Build tests: ${BUILD_TESTING}") -# lapack_testing.py uses features from python 2.7 and greater if(BUILD_TESTING) - set(_msg "Looking for Python >= 2.7 needed for summary tests") + set(_msg "Looking for Python3 needed for summary tests") message(STATUS "${_msg}") - find_package(PythonInterp 2.7 QUIET) - if(PYTHONINTERP_FOUND) - message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})") + # find_package(PythonInterp 3) cannot be used because /usr/bin/python may be + # a Python2 interpreter. 
+ find_program(PYTHON_EXECUTABLE python3) + if(PYTHON_EXECUTABLE) + message(STATUS "${_msg} - found") else() message(STATUS "${_msg} - not found (skipping summary tests)") endif() @@ -177,7 +277,7 @@ CheckLAPACKCompilerFlags() # Check second function include(CheckTimeFunction) -set(TIME_FUNC NONE ${TIME_FUNC}) +set(TIME_FUNC NONE) CHECK_TIME_FUNCTION(NONE TIME_FUNC) CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC) CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC) @@ -210,6 +310,7 @@ if(NOT (BUILD_SINGLE OR BUILD_DOUBLE OR BUILD_COMPLEX OR BUILD_COMPLEX16)) BUILD_SINGLE, BUILD_DOUBLE, BUILD_COMPLEX, BUILD_COMPLEX16.") endif() + # -------------------------------------------------- # Subdirectories that need to be processed option(USE_OPTIMIZED_BLAS "Whether or not to use an optimized BLAS library instead of included netlib BLAS" OFF) @@ -325,35 +426,80 @@ option(LAPACKE_WITH_TMG "Build LAPACKE with tmglib routines" OFF) if(LAPACKE_WITH_TMG) set(LAPACKE ON) endif() -if(BUILD_TESTING OR LAPACKE_WITH_TMG) #already included, avoid double inclusion + +# TMGLIB +# Cache export target +set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME}) +if(BUILD_TESTING OR LAPACKE_WITH_TMG) + if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG) + set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) + # Check if dlatms (part of tmg) is found + CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND) + unset(CMAKE_REQUIRED_LIBRARIES) + if(NOT LAPACK_WITH_TMGLIB_FOUND) + # Build and install TMG as part of LAPACKE targets (as opposed to LAPACK + # targets) + set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets) + endif() + endif() add_subdirectory(TESTING/MATGEN) endif() +# Reset export target +set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE}) +unset(LAPACK_INSTALL_EXPORT_NAME_CACHE) if(LAPACKE) add_subdirectory(LAPACKE) endif() + #------------------------------------- # BLAS++ / LAPACK++ option(BLAS++ "Build BLAS++" OFF) option(LAPACK++ "Build LAPACK++" OFF) - - + + function(_display_cpp_implementation_msg name) string(TOLOWER ${name} name_lc) message(STATUS "${name}++ enable") message(STATUS "----------------") message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for ${name} library") message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc.") - message(STATUS "We are still working on integrating ${name}++ in our library. 
For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp") message(STATUS "For support ${name}++ related question, please email: slate-user@icl.utk.edu") message(STATUS "----------------") endfunction() -if(BLAS++) +if (BLAS++) _display_cpp_implementation_msg("BLAS") + include(ExternalProject) + ExternalProject_Add(blaspp + URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make + INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + ) + ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES}) endif() -if(LAPACK++) +if (LAPACK++) + message (STATUS "linking lapack++ against ${LAPACK_LIBRARIES}") _display_cpp_implementation_msg("LAPACK") + include(ExternalProject) + if (BUILD_SHARED_LIBS) + ExternalProject_Add(lapackpp + URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make + INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + ) + else () +# FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files + ExternalProject_Add(lapackpp + URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz + CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp + BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make + INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + ) + endif() + ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) endif() # -------------------------------------------------- @@ -370,7 +516,7 @@ set(CPACK_MONOLITHIC_INSTALL ON) set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK") if(WIN32 AND NOT UNIX) # There is a bug in NSI that does not handle full unix paths properly. Make - # sure there is at least one set of four (4) backlasshes. + # sure there is at least one set of four (4) backslashes. 
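   # (Each run of eight backslashes in the quoted strings below collapses to
   # the four literal backslashes that the generated NSIS script needs.)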
set(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum") set(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/lapack") set(CPACK_NSIS_CONTACT "lapack@eecs.utk.edu") @@ -396,10 +542,6 @@ if(NOT LATESTLAPACK_FOUND) set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB}) endif() -if(BUILD_TESTING OR LAPACKE_WITH_TMG) - set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) -endif() - # Export lapack targets, not including lapacke, from the # install tree, if any. set(_lapack_config_install_guard_target "") @@ -424,6 +566,10 @@ if(LAPACKE) set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB}) endif() +if(NOT LAPACK_WITH_TMGLIB_FOUND AND LAPACKE_WITH_TMG) + set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) +endif() + # Export lapack and lapacke targets from the build tree, if any. set(_lapack_config_build_guard_target "") if(ALL_TARGETS) @@ -461,4 +607,114 @@ install(FILES DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION} COMPONENT Development ) - +if (LAPACK++) + install( + DIRECTORY "${LAPACK_BINARY_DIR}/lib/" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + FILES_MATCHING REGEX "liblapackpp.(a|so)$" + ) + install( + DIRECTORY "${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp/include/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + FILES_MATCHING REGEX "\\.(h|hh)$" + ) + write_basic_package_version_file( + "lapackppConfigVersion.cmake" + VERSION 2020.10.02 + COMPATIBILITY AnyNewerVersion + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/" + ) + +endif() +if (BLAS++) + write_basic_package_version_file( + "blasppConfigVersion.cmake" + VERSION 2020.10.02 + COMPATIBILITY AnyNewerVersion + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/" + ) + install( + DIRECTORY "${LAPACK_BINARY_DIR}/lib/" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + FILES_MATCHING REGEX "libblaspp.(a|so)$" + ) + install( + DIRECTORY "${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp/include/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + FILES_MATCHING REGEX "\\.(h|hh)$" + ) +endif() + +# -------------------------------------------------- +# Generate MAN and/or HTML Documentation +option(BUILD_HTML_DOCUMENTATION "Create and install the HTML based API +documentation (requires Doxygen) - command: make html" OFF) +option(BUILD_MAN_DOCUMENTATION "Create and install the MAN based documentation (requires Doxygen) - command: make man" OFF) +message(STATUS "Build html documentation: ${BUILD_HTML_DOCUMENTATION}") +message(STATUS "Build man documentation: ${BUILD_MAN_DOCUMENTATION}") + +if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION) + find_package(Doxygen) + if(NOT DOXYGEN_FOUND) + message(WARNING "Doxygen is needed to build the documentation.") + + else() + + set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage") + set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION}) + set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS) + set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png) + set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES) + set(DOXYGEN_SOURCE_BROWSER YES) + set(DISTRIBUTE_GROUP_DOC YES) + set(DOXYGEN_CREATE_SUBDIRS YES) + set(DOXYGEN_SEPARATE_MEMBER_PAGES YES) + set(DOXYGEN_EXTRACT_ALL YES) + set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h") + set(DOXYGEN_RECURSIVE YES) + set(DOXYGEN_GENERATE_TREEVIEW YES) + set(DOXYGEN_INTERACTIVE_SVG 
YES) + set(DOXYGEN_QUIET YES) + set(DOXYGEN_WARNINGS NO) + set(DOXYGEN_GENERATE_HTML NO) + set(DOXYGEN_GENERATE_MAN NO) + + + if (BUILD_HTML_DOCUMENTATION) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_HTML_OUTPUT explore-html) + set(DOXYGEN_INLINE_SOURCES YES) + set(DOXYGEN_CALL_GRAPH YES) + set(DOXYGEN_CALLER_GRAPH YES) + + doxygen_add_docs( + html + ${PROJECT_SOURCE_DIR} + COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)" + ) + endif() + if (BUILD_MAN_DOCUMENTATION) + set(DOXYGEN_GENERATE_MAN YES) + set(DOXYGEN_EXCLUDE SRC/VARIANTS) + set(DOXYGEN_MAN_LINKS YES) + set(DOXYGEN_INLINE_SOURCES NO) + set(DOXYGEN_CALL_GRAPH NO) + set(DOXYGEN_CALLER_GRAPH NO) + + doxygen_add_docs( + man + ${PROJECT_SOURCE_DIR} + COMMENT "Generating man LAPACK documentation" + ) + endif() + + endif() +endif() diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 794c2a7aa..668ca8d68 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -575,7 +575,7 @@ There are six machine-dependent functions in the test and timing package, at least three of which must be installed. They are \begin{tabbing} -MONOMO \= DOUBLE PRECYSION \= \kill +MONOMO \= DOUBLE PRECISION \= \kill LSAME \> LOGICAL \> Test if two characters are the same regardless of case \\ SLAMCH \> REAL \> Determine machine-dependent parameters \\ DLAMCH \> DOUBLE PRECISION \> Determine machine-dependent parameters \\ diff --git a/lapack-netlib/INSTALL/lsametst.c b/lapack-netlib/INSTALL/lsametst.c index 4b46115fc..631733841 100644 --- a/lapack-netlib/INSTALL/lsametst.c +++ b/lapack-netlib/INSTALL/lsametst.c @@ -426,7 +426,7 @@ static integer c__3 = 3; /* December 2016 */ /* ===================================================================== */ -/* Main program */ main(void) +/* Main program */ int main(void) { /* Format strings */ static char fmt_9999[] = "(\002 *** Error: LSAME( \002,a1,\002, \002," diff --git a/lapack-netlib/INSTALL/secondtst.c b/lapack-netlib/INSTALL/secondtst.c index 694679bb5..03e7814e9 100644 --- a/lapack-netlib/INSTALL/secondtst.c +++ b/lapack-netlib/INSTALL/secondtst.c @@ -422,7 +422,7 @@ static integer c__1000 = 1000; /* ===================================================================== */ -/* Main program */ main(void) +/* Main program */ int main(void) { /* Format strings */ static char fmt_9999[] = "(\002 Time for \002,g10.3,\002 SAXPY ops = " diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 3e7f9de5b..28f8ad655 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3323,6 +3323,216 @@ void LAPACK_zgesdd_base( #define LAPACK_zgesdd(...) 
LAPACK_zgesdd_base(__VA_ARGS__) #endif +#define LAPACK_cgedmd_base LAPACK_GLOBAL(cgedmd,CGEDMD) +void LAPACK_cgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + lapack_complex_float* x, lapack_int const* ldx, + lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk, + const float* tol, lapack_int* k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int const* ldz, float* res, + lapack_complex_float* b, lapack_int const* ldb, + lapack_complex_float* w, lapack_int const* ldw, + lapack_complex_float* s, lapack_int const* lds, + lapack_complex_float* zwork, lapack_int const* lzwork, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__) +#endif + + +#define LAPACK_dgedmd_base LAPACK_GLOBAL(dgedmd,DGEDMD) +void LAPACK_dgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + double* x, lapack_int const* ldx, + double* y, lapack_int const* ldy, lapack_int const* nrnk, + const double* tol, lapack_int* k, double* reig, double* imeig, + double* z, lapack_int const* ldz, double* res, + double* b, lapack_int const* ldb, + double* w, lapack_int const* ldw, + double* s, lapack_int const* lds, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__) +#endif + +#define LAPACK_sgedmd_base LAPACK_GLOBAL(sgedmd,SGEDMD) +void LAPACK_sgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + float* x, lapack_int const* ldx, + float* y, lapack_int const* ldy, lapack_int const* nrnk, + const float* tol, lapack_int* k, float* reig, float *imeig, + float* z, lapack_int const* ldz, float* res, + float* b, lapack_int const* ldb, + float* w, lapack_int const* ldw, + float* s, lapack_int const* lds, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_sgedmd(...) 
LAPACK_sgedmd_base(__VA_ARGS__) +#endif + +#define LAPACK_zgedmd_base LAPACK_GLOBAL(zgedmd,ZGEDMD) +void LAPACK_zgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + lapack_complex_double* x, lapack_int const* ldx, + lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk, + const double* tol, lapack_int *k, lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int const* ldz, double* res, + lapack_complex_double* b, lapack_int const* ldb, + lapack_complex_double* w, lapack_int const* ldw, + lapack_complex_double* s, lapack_int const* lds, + lapack_complex_double* zwork, lapack_int const* lzwork, + double* rwork, lapack_int const* lrwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__) +#endif + +#define LAPACK_cgedmdq_base LAPACK_GLOBAL(cgedmdq,CGEDMDQ) +void LAPACK_cgedmdq_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + lapack_complex_float* f, lapack_int const* ldf, + lapack_complex_float* x, lapack_int const* ldx, + lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk, + float const* tol, lapack_int const* k, + lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int const* ldz, float* res, + lapack_complex_float* b, lapack_int const* ldb, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* s, lapack_int const* lds, + lapack_complex_float* zwork, lapack_int const* lzwork, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__) +#endif + +#define LAPACK_dgedmdq_base LAPACK_GLOBAL(dgedmdq,DGEDMDQ) +void LAPACK_dgedmdq_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + double* f, lapack_int const* ldf, + double* x, lapack_int const* ldx, + double* y, lapack_int const* ldy, lapack_int const* nrnk, + double const* tol, lapack_int* k, + double* reig, double *imeig, + double* z, lapack_int const* ldz, double* res, + double* b, lapack_int const* ldb, + double* v, lapack_int const* ldv, + double* s, lapack_int const* lds, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_dgedmdq(...) 
LAPACK_dgedmdq_base(__VA_ARGS__) +#endif + +#define LAPACK_sgedmdq_base LAPACK_GLOBAL(sgedmdq,SGEDMDQ) +void LAPACK_sgedmdq_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + float* f, lapack_int const* ldf, + float* x, lapack_int const* ldx, + float* y, lapack_int const* ldy, lapack_int const* nrnk, + float const* tol, lapack_int const* k, + float* reig, float* imeig, + float* z, lapack_int const* ldz, float* res, + float* b, lapack_int const* ldb, + float* v, lapack_int const* ldv, + float* s, lapack_int const* lds, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__) +#endif + +#define LAPACK_zgedmdq_base LAPACK_GLOBAL(zgedmdq,ZGEDMDQ) +void LAPACK_zgedmdq_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + lapack_complex_double* f, lapack_int const* ldf, + lapack_complex_double* x, lapack_int const* ldx, + lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk, + double const* tol, lapack_int const* k, + lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int const* ldz, double* res, + lapack_complex_double* b, lapack_int const* ldb, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* s, lapack_int const* lds, + lapack_complex_double* zwork, lapack_int const* lzwork, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info + +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__) +#endif + #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV) lapack_int LAPACK_cgesv( lapack_int const* n, lapack_int const* nrhs, @@ -11517,6 +11727,22 @@ void LAPACK_sorgtsqr_row( float* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_dorhr_col LAPACK_GLOBAL(dorhr_col,DORHR_COL) +void LAPACK_dorhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, double* A, + lapack_int const* lda, double* T, + lapack_int const* ldt, double* D, + lapack_int* info ); + +#define LAPACK_sorhr_col LAPACK_GLOBAL(sorhr_col,SORHR_COL) +void LAPACK_sorhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, float* A, + lapack_int const* lda, float* T, + lapack_int const* ldt, float* D, + lapack_int* info ); + #define LAPACK_dormbr_base LAPACK_GLOBAL(dormbr,DORMBR) void LAPACK_dormbr_base( char const* vect, char const* side, char const* trans, @@ -21501,7 +21727,7 @@ void LAPACK_ztrevc_base( #endif ); #ifdef LAPACK_FORTRAN_STRLEN_END - #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, 1, 1) + #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, (size_t)1, 1) #else #define LAPACK_ztrevc(...) 
LAPACK_ztrevc_base(__VA_ARGS__) #endif @@ -22708,6 +22934,22 @@ void LAPACK_zungtsqr_row( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cunhr_col LAPACK_GLOBAL(cunhr_col,CUNHR_COL) +void LAPACK_cunhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, lapack_complex_float* A, + lapack_int const* lda, lapack_complex_float* T, + lapack_int const* ldt, lapack_complex_float* D, + lapack_int* info ); + +#define LAPACK_zunhr_col LAPACK_GLOBAL(zunhr_col,ZUNHR_COL) +void LAPACK_zunhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, lapack_complex_double* A, + lapack_int const* lda, lapack_complex_double* T, + lapack_int const* ldt, lapack_complex_double* D, + lapack_int* info ); + #define LAPACK_cunmbr_base LAPACK_GLOBAL(cunmbr,CUNMBR) void LAPACK_cunmbr_base( char const* vect, char const* side, char const* trans, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 9998b1504..377e2a6bb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -956,7 +956,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, char jobr, lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv, lapack_int* numrank ); - + lapack_int LAPACKE_sgesvj( int matrix_layout, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, lapack_int mv, float* v, lapack_int ldv, @@ -5712,6 +5712,122 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m, lapack_complex_double* work, lapack_int lwork, double* rwork, lapack_int* iwork ); +lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, + lapack_int ldw, float* s, lapack_int lds, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); + +lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double *imeig, + double* z, lapack_int ldz, double* res, + double* b, lapack_int ldb, double* w, + lapack_int ldw, double* s, lapack_int lds, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); + +lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, + lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* w, lapack_int ldw, + lapack_complex_float* s, lapack_int lds, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); + +lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + 
lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int ldz, + double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); + +lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + float* f, lapack_int ldf, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float *imeig, float* z, + lapack_int ldz, float* res, float* b, + lapack_int ldb, float* v, lapack_int ldv, + float* s, lapack_int lds, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ); + +lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + double* f, lapack_int ldf, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, + lapack_int ldb, double* v, lapack_int ldv, + double* s, lapack_int lds, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ); + +lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_float* f, lapack_int ldf, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, + lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* v, lapack_int ldv, + lapack_complex_float* s, lapack_int lds, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork); + +lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_double* f, lapack_int ldf, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int ldz, + double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* v, lapack_int ldv, + lapack_complex_double* s, lapack_int lds, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork); + lapack_int LAPACKE_sgesv_work( int matrix_layout, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, lapack_int* ipiv, float* b, lapack_int ldb ); @@ -5833,7 +5949,7 @@ lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp, lapack_int* iwork, lapack_int liwork, lapack_complex_double* cwork, lapack_int lcwork, double* rwork, lapack_int lrwork); - + lapack_int LAPACKE_sgesvj_work( int matrix_layout, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, lapack_int mv, @@ -12550,7 +12666,7 @@ lapack_int LAPACKE_zhegv_2stage_work( int matrix_layout, lapack_int itype, char //LAPACK 3.8.0 lapack_int LAPACKE_ssysv_aa_2stage( int matrix_layout, 
char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, - float* tb, lapack_int ltb, lapack_int* ipiv, + float* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int ldb ); lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, @@ -12560,7 +12676,7 @@ lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_in lapack_int LAPACKE_dsysv_aa_2stage( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, double* tb, lapack_int ltb, - lapack_int* ipiv, lapack_int* ipiv2, + lapack_int* ipiv, lapack_int* ipiv2, double* b, lapack_int ldb ); lapack_int LAPACKE_dsysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, @@ -12612,10 +12728,10 @@ lapack_int LAPACKE_zhesv_aa_2stage_work( int matrix_layout, char uplo, lapack_in lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work, lapack_int lwork ); - + lapack_int LAPACKE_ssytrf_aa_2stage( int matrix_layout, char uplo, lapack_int n, float* a, lapack_int lda, - float* tb, lapack_int ltb, lapack_int* ipiv, + float* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2 ); lapack_int LAPACKE_ssytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, float* a, lapack_int lda, @@ -12671,7 +12787,7 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int LAPACKE_ssytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, - float* tb, lapack_int ltb, lapack_int* ipiv, + float* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int ldb ); lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, @@ -12680,7 +12796,7 @@ lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int LAPACKE_dsytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, double* tb, lapack_int ltb, - lapack_int* ipiv, lapack_int* ipiv2, + lapack_int* ipiv, lapack_int* ipiv2, double* b, lapack_int ldb ); lapack_int LAPACKE_dsytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, @@ -12727,7 +12843,40 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int lda, lapack_complex_double* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, lapack_complex_double* b, lapack_int ldb ); - +//LAPACK 3.10.0 +lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d ); +lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d ); +lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d ); +lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d ); +lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d ); 
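+/* The *orhr_col/*unhr_col routines reconstruct the Householder (compact WY)
+   representation behind an explicitly formed orthonormal factor, such as the
+   one produced by the *orgtsqr_row/*ungtsqr_row routines. A minimal
+   column-major sketch, assuming a already holds an m-by-n factor of that
+   kind, nb matches the block size used to build it, and t and d are
+   caller-allocated (error handling omitted):
+
+       lapack_int info = LAPACKE_dorhr_col( LAPACK_COL_MAJOR, m, n, nb,
+                                            a, lda, t, ldt, d );
+
+   On exit, a holds the Householder vectors, t the triangular block-reflector
+   factors, and d a diagonal of +1/-1 scaling factors. */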
+lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d ); +lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d ); +lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d ); + /* APIs for set/get nancheck flags */ void LAPACKE_set_nancheck( int flag ); int LAPACKE_get_nancheck( void ); diff --git a/lapack-netlib/LAPACKE/src/CMakeLists.txt b/lapack-netlib/LAPACKE/src/CMakeLists.txt index 4171a3bd4..89890f923 100644 --- a/lapack-netlib/LAPACKE/src/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt @@ -589,6 +589,8 @@ lapacke_cungtr.c lapacke_cungtr_work.c lapacke_cungtsqr_row.c lapacke_cungtsqr_row_work.c +lapacke_cunhr_col.c +lapacke_cunhr_col_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -857,6 +859,8 @@ lapacke_dorgtr.c lapacke_dorgtr_work.c lapacke_dorgtsqr_row.c lapacke_dorgtsqr_row_work.c +lapacke_dorhr_col.c +lapacke_dorhr_col_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1432,6 +1436,8 @@ lapacke_sorgtr.c lapacke_sorgtr_work.c lapacke_sorgtsqr_row.c lapacke_sorgtsqr_row_work.c +lapacke_sorhr_col.c +lapacke_sorhr_col_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -2346,6 +2352,8 @@ lapacke_zungtr.c lapacke_zungtr_work.c lapacke_zungtsqr_row.c lapacke_zungtsqr_row_work.c +lapacke_zunhr_col.c +lapacke_zunhr_col_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 9c02c1445..969288f42 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -28,7 +28,6 @@ ############################################################################## # Contents: Native C interface to LAPACK # Author: Intel Corporation -# September, 2010 ############################################################################## # makefile for LAPACKE, used to build lapacke binary. 
# @@ -138,6 +137,10 @@ lapacke_cgerqf.o \ lapacke_cgerqf_work.o \ lapacke_cgesdd.o \ lapacke_cgesdd_work.o \ +lapacke_cgedmd.o \ +lapacke_cgedmd_work.o \ +lapacke_cgedmdq.o \ +lapacke_cgedmdq_work.o \ lapacke_cgesv.o \ lapacke_cgesv_work.o \ lapacke_cgesvd.o \ @@ -608,6 +611,8 @@ lapacke_ctrsna.o \ lapacke_ctrsna_work.o \ lapacke_ctrsyl.o \ lapacke_ctrsyl_work.o \ +lapacke_ctrsyl3.o \ +lapacke_ctrsyl3_work.o \ lapacke_ctrtri.o \ lapacke_ctrtri_work.o \ lapacke_ctrtrs.o \ @@ -640,6 +645,8 @@ lapacke_cungtr.o \ lapacke_cungtr_work.o \ lapacke_cungtsqr_row.o \ lapacke_cungtsqr_row_work.o \ +lapacke_cunhr_col.o \ +lapacke_cunhr_col_work.o \ lapacke_cunmbr.o \ lapacke_cunmbr_work.o \ lapacke_cunmhr.o \ @@ -760,6 +767,10 @@ lapacke_dgerqf.o \ lapacke_dgerqf_work.o \ lapacke_dgesdd.o \ lapacke_dgesdd_work.o \ +lapacke_dgedmd.o \ +lapacke_dgedmd_work.o \ +lapacke_dgedmdq.o \ +lapacke_dgedmdq_work.o \ lapacke_dgesv.o \ lapacke_dgesv_work.o \ lapacke_dgesvd.o \ @@ -912,6 +923,8 @@ lapacke_dorgtr.o \ lapacke_dorgtr_work.o \ lapacke_dorgtsqr_row.o \ lapacke_dorgtsqr_row_work.o \ +lapacke_dorhr_col.o \ +lapacke_dorhr_col_work.o \ lapacke_dormbr.o \ lapacke_dormbr_work.o \ lapacke_dormhr.o \ @@ -1224,6 +1237,8 @@ lapacke_dtrsna.o \ lapacke_dtrsna_work.o \ lapacke_dtrsyl.o \ lapacke_dtrsyl_work.o \ +lapacke_dtrsyl3.o \ +lapacke_dtrsyl3_work.o \ lapacke_dtrtri.o \ lapacke_dtrtri_work.o \ lapacke_dtrtrs.o \ @@ -1336,6 +1351,10 @@ lapacke_sgerqf.o \ lapacke_sgerqf_work.o \ lapacke_sgesdd.o \ lapacke_sgesdd_work.o \ +lapacke_sgedmd.o \ +lapacke_sgedmd_work.o \ +lapacke_sgedmdq.o \ +lapacke_sgedmdq_work.o \ lapacke_sgesv.o \ lapacke_sgesv_work.o \ lapacke_sgesvd.o \ @@ -1486,6 +1505,8 @@ lapacke_sorgtr.o \ lapacke_sorgtr_work.o \ lapacke_sorgtsqr_row.o \ lapacke_sorgtsqr_row_work.o \ +lapacke_sorhr_col.o \ +lapacke_sorhr_col_work.o \ lapacke_sormbr.o \ lapacke_sormbr_work.o \ lapacke_sormhr.o \ @@ -1794,6 +1815,8 @@ lapacke_strsna.o \ lapacke_strsna_work.o \ lapacke_strsyl.o \ lapacke_strsyl_work.o \ +lapacke_strsyl3.o \ +lapacke_strsyl3_work.o \ lapacke_strtri.o \ lapacke_strtri_work.o \ lapacke_strtrs.o \ @@ -1902,6 +1925,10 @@ lapacke_zgerqf.o \ lapacke_zgerqf_work.o \ lapacke_zgesdd.o \ lapacke_zgesdd_work.o \ +lapacke_zgedmd.o \ +lapacke_zgedmd_work.o \ +lapacke_zgedmdq.o \ +lapacke_zgedmdq_work.o \ lapacke_zgesv.o \ lapacke_zgesv_work.o \ lapacke_zgesvd.o \ @@ -2372,6 +2399,8 @@ lapacke_ztrsna.o \ lapacke_ztrsna_work.o \ lapacke_ztrsyl.o \ lapacke_ztrsyl_work.o \ +lapacke_ztrsyl3.o \ +lapacke_ztrsyl3_work.o \ lapacke_ztrtri.o \ lapacke_ztrtri_work.o \ lapacke_ztrtrs.o \ @@ -2404,6 +2433,8 @@ lapacke_zungtr.o \ lapacke_zungtr_work.o \ lapacke_zungtsqr_row.o \ lapacke_zungtsqr_row_work.o \ +lapacke_zunhr_col.o \ +lapacke_zunhr_col_work.o \ lapacke_zunmbr.o \ lapacke_zunmbr_work.o \ lapacke_zunmhr.o \ @@ -2544,7 +2575,7 @@ $(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTEND ifdef BUILD_DEPRECATED $(AR) $(ARFLAGS) $@ $(DEPRECATED) endif -ifdef (USEXBLAS) +ifdef USEXBLAS $(AR) $(ARFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c new file mode 100644 index 000000000..6c77e199e --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c @@ -0,0 +1,127 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* w, + lapack_int ldw, lapack_complex_float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_float zwork_query; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -20; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, + lzwork, &work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + 
goto exit_level_0; + } + lzwork = LAPACK_C2INT( zwork_query ); + lwork = LAPACK_C2INT( work_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork ); + if( zwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_2; + } + /* Call middle-level interface */ + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_2: + LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c new file mode 100644 index 000000000..08d8b91f5 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c @@ -0,0 +1,184 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, lapack_int nrnk, + float* tol, lapack_int k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* w, + lapack_int ldw, lapack_complex_float* s, lapack_int lds, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, zwork, &lzwork, work, &lwork, iwork, &liwork, + &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_float* x_t = NULL; + lapack_complex_float* y_t = NULL; + lapack_complex_float* z_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* w_t = NULL; + lapack_complex_float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c new file mode 100644 index 000000000..b0b258f97 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c @@ -0,0 +1,133 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, lapack_complex_float* f, + lapack_int ldf, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int ldz, + float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* v, + lapack_int ldv, lapack_complex_float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_float zwork_query; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lzwork = LAPACK_C2INT( zwork_query ); + lwork = LAPACK_C2INT( work_query ); + liwork 
= iwork_query;
+    /* Allocate memory for work arrays */
+    zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork );
+    if( zwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    work = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_2;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, zwork, lzwork,
+                                 work, lwork, iwork, liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_2:
+    LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
new file mode 100644
index 000000000..05287c1bc
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
@@ -0,0 +1,205 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_float* f, lapack_int ldf, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, + lapack_complex_float* z, + lapack_int ldz, float* res, + lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* v, + lapack_int ldv, lapack_complex_float* s, + lapack_int lds, lapack_complex_float *zwork, + lapack_int lzwork, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_float* f_t = NULL; + lapack_complex_float* x_t = NULL; + lapack_complex_float* y_t = NULL; + lapack_complex_float* z_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* v_t = NULL; + lapack_complex_float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lzwork == -1 || lwork == -1 || liwork == -1 ) { + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function on the transposed copies and adjust info */
+        LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f_t, &ldf_t, x_t, &ldx_t, y_t, &ldy_t, &nrnk, tol,
+                        &k, eigs, z_t, &ldz_t, res, b_t, &ldb_t, v_t, &ldv_t,
+                        s_t, &lds_t, zwork, &lzwork, work, &lwork, iwork,
+                        &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
index 962624d21..3f8f0cf17 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_cgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
     }
     /* Transpose output matrices */
LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index ed12b476e..aac7b551d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c index 545769b83..67bbbd34f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c new file mode 100644 index 000000000..7ed1ad4c4 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cunhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_cunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c new file mode 100644 index 000000000..76b8366f0 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + lapack_complex_float* a_t = NULL; + lapack_complex_float* t_t = NULL; + /* Check leading 
dimension(s) */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)
+              LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_float*)
+              LAPACKE_malloc( sizeof(lapack_complex_float) *
+                              ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices; T has MIN(nb,n) rows, so only that
+           many rows of t_t are valid (cf. the *geqrt_work fix above) */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, MIN(nb,n), n, t_t, ldt_t, t,
+                           ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
new file mode 100644
index 000000000..6802378da
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
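The wrappers release temporaries with a cascade of exit_level_* labels: each allocation failure jumps to the label that frees only what was already allocated, and the success path frees the last buffer and falls through the remaining labels. A stripped-down sketch of the idiom with two hypothetical buffers:

#include <stdlib.h>

/* Returns 0 on success, -1 if either allocation fails. */
static int unwind_demo( size_t n )
{
    int info = 0;
    double *a = NULL, *b = NULL;
    a = (double*)malloc( n * sizeof(double) );
    if( a == NULL ) {
        info = -1;
        goto exit_level_0;   /* nothing allocated yet, free nothing */
    }
    b = (double*)malloc( n * sizeof(double) );
    if( b == NULL ) {
        info = -1;
        goto exit_level_1;   /* free a only */
    }
    /* ... use a and b ... */
    free( b );               /* success path frees b, then falls through */
exit_level_1:
    free( a );
exit_level_0:
    return info;
}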
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, double* x, lapack_int ldx, double* y, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, double* reig, double* imeig, + double* z, lapack_int ldz, + double* res, double* b, lapack_int ldb, + double* w, lapack_int ldw, double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c new file mode 100644 index 000000000..987709a1b --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c @@ -0,0 +1,181 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, + double* z, lapack_int ldz, double* res, + double* b, lapack_int ldb, double* w, + lapack_int ldw, double* s, lapack_int lds, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + double* x_t = NULL; + double* y_t = NULL; + double* z_t = NULL; + double* b_t = NULL; + double* w_t = NULL; + double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, work, 
&lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (double*)LAPACKE_malloc( sizeof(double) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, + y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, res, b_t, + &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, &lwork, + iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c new file mode 100644 index 000000000..5c3c39308 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c @@ -0,0 +1,119 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* f, lapack_int ldf, + double* x, lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, lapack_int ldb, + double* v, lapack_int ldv, double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + 
jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c new file mode 100644 index 000000000..149e6d24f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c @@ -0,0 +1,200 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
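Every row-major path in these wrappers leans on the LAPACKE_*ge_trans utilities to copy between the caller's row-major arrays and column-major temporaries for the Fortran kernel. A small self-contained round trip, using the same (layout, m, n, in, ldin, out, ldout) argument pattern the wrappers use:

#include <stdio.h>
#include "lapacke_utils.h"

int main(void)
{
    /* 2x3 matrix in row-major order, leading dimension = 3 (columns). */
    double a[6]   = { 1.0, 2.0, 3.0,
                      4.0, 5.0, 6.0 };
    double a_t[6] = { 0.0 };   /* column-major copy, leading dimension = 2 */
    /* Row-major input -> column-major temporary (done before the call) */
    LAPACKE_dge_trans( LAPACK_ROW_MAJOR, 2, 3, a, 3, a_t, 2 );
    /* Column-major temporary -> row-major output (done afterwards) */
    LAPACKE_dge_trans( LAPACK_COL_MAJOR, 2, 3, a_t, 2, a, 3 );
    printf( "a(1,2) = %g\n", a[1*3 + 2] );   /* prints 6, unchanged */
    return 0;
}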
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + double* f, lapack_int ldf, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, + lapack_int ldb, double* v, lapack_int ldv, + double* s, lapack_int lds, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + double* f_t = NULL; + double* x_t = NULL; + double* y_t = NULL; + double* z_t = NULL; + double* b_t = NULL; + double* v_t = NULL; + double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (double*)LAPACKE_malloc( sizeof(double) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (double*)LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function on the transposed copies and adjust info */
+        LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f_t, &ldf_t, x_t, &ldx_t, y_t, &ldy_t, &nrnk, tol,
+                        &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, v_t,
+                        &ldv_t, s_t, &lds_t, work, &lwork, iwork, &liwork,
+                        &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
index 7796edffc..0460b6406 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
@@ -80,10 +80,13 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort,
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
         iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
-        if( iwork == NULL ) {
-            info = LAPACK_WORK_MEMORY_ERROR;
-            goto exit_level_1;
-        }
+    }
+    else {
+        iwork
= (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 ); + } + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; } work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); if( work == NULL ) { @@ -97,9 +100,7 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort, /* Release memory and exit */ LAPACKE_free( work ); exit_level_2: - if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_1: if( LAPACKE_lsame( sort, 's' ) ) { LAPACKE_free( bwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c index 8a4c7cead..000c94e0b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c @@ -80,7 +80,7 @@ lapack_int LAPACKE_dgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index f4ddc62a5..aeebd8dec 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c index de444c146..de2f41e66 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c @@ -59,7 +59,7 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 
'l' : 'u';
     ldc_t = MAX(1,m);
     ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c
new file mode 100644
index 000000000..1f37725e9
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, double* a,
+                              lapack_int lda, double* t,
+                              lapack_int ldt, double* d)
+{
+    lapack_int info = 0;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dorhr_col", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -5;
+        }
+    }
+#endif
+    /* Call middle-level interface */
+    info = LAPACKE_dorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c
new file mode 100644
index 000000000..28b80cc02
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, double* a,
+                                   lapack_int lda, double* t,
+                                   lapack_int ldt, double* d )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        double* a_t = NULL;
+        double* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)
+              LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)
+              LAPACKE_malloc( sizeof(double) *
+                              ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices; T has MIN(nb,n) rows, so only that
+           many rows of t_t are valid (cf. the *geqrt_work fix above) */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, MIN(nb,n), n, t_t, ldt_t, t,
+                           ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
new file mode 100644
index 000000000..6865fcf65
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
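The *larfb hunks above swap `left` for `forward` when selecting the triangle of V: per the (s/d/c/z)larfb documentation, which part of V carries the unit-triangular reflector block is fixed by DIRECT (forward versus backward storage) together with STOREV (columnwise versus rowwise), independent of SIDE. A standalone restatement of the corrected predicate (function name hypothetical):

/* columnwise + forward  -> leading block unit lower triangular -> 'l'
 * columnwise + backward -> trailing block unit upper triangular -> 'u'
 * rowwise    + forward  -> leading block unit upper triangular -> 'u'
 * rowwise    + backward -> trailing block unit lower triangular -> 'l'
 * i.e. lower exactly when the two flags agree. */
static char larfb_uplo( int forward, int columnwise )
{
    return ( ( forward && columnwise ) ||
             !( forward || columnwise ) ) ? 'l' : 'u';
}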
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, float* x, lapack_int ldx, float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, lapack_int ldw, + float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + float* work = NULL; + lapack_int* iwork = NULL; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info 
= LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c new file mode 100644 index 000000000..5b24152da --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c @@ -0,0 +1,182 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
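A negative INFO from the Fortran routine is shifted by one throughout these wrappers because Fortran reports the 1-based position of the offending argument, and every C prototype prepends matrix_layout, so Fortran argument i sits at C position i+1. The adjustment factored out as a sketch; for example, a bad third Fortran argument (INFO = -3) is reported as -4:

#include "lapacke.h"

/* Map Fortran's INFO = -i (bad i-th Fortran argument) to the C argument
 * position: matrix_layout occupies slot 1, so -i becomes -(i+1). */
static lapack_int adjust_fortran_info( lapack_int info )
{
    return ( info < 0 ) ? info - 1 : info;
}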
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, + lapack_int ldw, float* s, lapack_int lds, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + float* x_t = NULL; + float* y_t = NULL; + float* z_t = NULL; + float* b_t = NULL; + float* w_t = NULL; + float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, work, &lwork, iwork, + &liwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (float*)LAPACKE_malloc( sizeof(float) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, + &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c new file mode 100644 index 000000000..e65c2094f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c @@ -0,0 +1,119 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* f, lapack_int ldf, + float* x, lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, lapack_int ldb, + float* v, lapack_int ldv, float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + float* work = NULL; + lapack_int* iwork = NULL; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, 
n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c new file mode 100644 index 000000000..e1c1f5c98 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c @@ -0,0 +1,200 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + float* f, lapack_int ldf, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, + lapack_int ldb, float* v, lapack_int ldv, + float* s, lapack_int lds, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + float* f_t = NULL; + float* x_t = NULL; + float* y_t = NULL; + float* z_t = NULL; + float* b_t = NULL; + float* v_t = NULL; + float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (float*)LAPACKE_malloc( sizeof(float) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function on the transposed buffers and adjust info */ + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f_t, &ldf_t, x_t, &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, + z_t, &ldz_t, res, b_t, &ldb_t, v_t, &ldv_t, s_t, &lds_t, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c index f7be44297..d2555ecc8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c @@ -80,10 +80,13 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort, /* Allocate memory for work arrays */ if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; - } + } + else { + iwork =
(lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 ); + } + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; } work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); if( work == NULL ) { @@ -97,9 +100,7 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort, /* Release memory and exit */ LAPACKE_free( work ); exit_level_2: - if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_1: if( LAPACKE_lsame( sort, 's' ) ) { LAPACKE_free( bwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c index 00ca6e501..d357845ae 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_sgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index d36958f93..3d6c29f88 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c index 8b6127633..72a392a77 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c @@ -59,7 +59,7 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 
'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c new file mode 100644 index 000000000..60e6e7951 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_sorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c new file mode 100644 index 000000000..56d6a965e --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + float* a_t = NULL; + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*) + LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*) + LAPACKE_malloc( sizeof(float) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices; T has MIN(nb,n) meaningful rows */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, MIN(nb,n), n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c new file mode 100644 index 000000000..e4ea4fe10 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c @@ -0,0 +1,128 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* zw, lapack_int lzw, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + lapack_complex_double zwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, lzwork, + 
&work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + liwork = iwork_query; + lzwork = LAPACK_Z2INT( zwork_query ); + /* Allocate memory for work arrays */ + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_2; + } + /* Call middle-level interface */ + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_2: + LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c new file mode 100644 index 000000000..ebacfaa94 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c @@ -0,0 +1,186 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, zwork, &lzwork, work, &lwork, iwork, + &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_double* x_t = NULL; + lapack_complex_double* y_t = NULL; + lapack_complex_double* z_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* w_t = NULL; + lapack_complex_double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, work, + &lwork, iwork, &liwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c new file mode 100644 index 000000000..368d48e20 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c @@ -0,0 +1,133 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, lapack_complex_double* f, + lapack_int ldf, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int ldz, + double* res, lapack_complex_double* b, + lapack_int ldb, lapack_complex_double* v, + lapack_int ldv, lapack_complex_double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_complex_double zwork_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + lzwork = LAPACK_Z2INT( 
zwork_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_2; + } + /* Call middle-level interface */ + info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_2: + LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c new file mode 100644 index 000000000..131e4f9ad --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c @@ -0,0 +1,205 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE.
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_double* f, lapack_int ldf, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, + lapack_complex_double* z, + lapack_int ldz, double* res, + lapack_complex_double* b, + lapack_int ldb, lapack_complex_double* v, + lapack_int ldv, lapack_complex_double* s, + lapack_int lds, lapack_complex_double* zwork, + lapack_int lzwork, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_double* f_t = NULL; + lapack_complex_double* x_t = NULL; + lapack_complex_double* y_t = NULL; + lapack_complex_double* z_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* v_t = NULL; + lapack_complex_double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function on the transposed buffers and adjust info */ + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f_t, &ldf_t, x_t, &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, + z_t, &ldz_t, res, b_t, &ldb_t, v_t, &ldv_t, s_t, &lds_t, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c index 8f6f1ed0b..633111533 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_zgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices
*/ LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 85355b202..c5edbbc0e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c index 72d85ec82..232c8ef58 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c new file mode 100644 index 000000000..7e2507daf --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zunhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_zunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c new file mode 100644 index 000000000..b5e640177 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + lapack_complex_double* a_t = NULL; + lapack_complex_double* t_t = NULL; + /* Check 
leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices; T has MIN(nb,n) meaningful rows */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, MIN(nb,n), n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c index 2adf71493..b32fc2f9e 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just a reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c index 6624936a6..89e421eae 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just a reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c index 046fdb9ca..a90c9617a 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just a reference implementation.
*/ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c index 32a9de379..63323b1c3 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation + * This is just a reference implementation */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.f b/lapack-netlib/SRC/DEPRECATED/cgegs.f index 1f0791a20..b6adf9111 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/cgegs.f @@ -1,4 +1,4 @@ -*> \brief CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief CGEGS computes the eigenvalues, Schur form, and, optionally, the left and/or right Schur vectors of a complex matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.f b/lapack-netlib/SRC/DEPRECATED/cgegv.f index ba810ddef..d2b254255 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/cgegv.f @@ -1,4 +1,4 @@ -*> \brief CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief CGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.f b/lapack-netlib/SRC/DEPRECATED/cggsvd.f index e6fdb47e5..515ac8d49 100644 --- a/lapack-netlib/SRC/DEPRECATED/cggsvd.f +++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.f @@ -107,7 +107,7 @@ *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of *> A and B implicitly gives the SVD of A*inv(B): *> A*inv(B) = U*(D1*inv(D2))*V**H. -*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also +*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also *> equal to the CS decomposition of A and B. Furthermore, the GSVD can *> be used to derive the solution of the eigenvalue problem: *> A**H*A x = lambda* B**H*B x. diff --git a/lapack-netlib/SRC/DEPRECATED/dgegs.f b/lapack-netlib/SRC/DEPRECATED/dgegs.f index 0ac0112c2..02e9fdcb2 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/dgegs.f @@ -1,4 +1,4 @@ -*> \brief DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief DGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.f b/lapack-netlib/SRC/DEPRECATED/dgegv.f index 7e81c85a9..0b5c48922 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/dgegv.f @@ -1,4 +1,4 @@ -*> \brief DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief DGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B).
* * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/sgegs.f b/lapack-netlib/SRC/DEPRECATED/sgegs.f index 2ed9ad942..11ecc67ac 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/sgegs.f @@ -1,4 +1,4 @@ -*> \brief SGEGS computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief SGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.f b/lapack-netlib/SRC/DEPRECATED/sgegv.f index 7a179a499..97556e371 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/sgegv.f @@ -1,4 +1,4 @@ -*> \brief SGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief SGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/zgegs.f b/lapack-netlib/SRC/DEPRECATED/zgegs.f index c5cdd26e5..23f8d43d1 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/zgegs.f @@ -1,4 +1,4 @@ -*> \brief ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief ZGEGS computes the eigenvalues, Schur form, and, optionally, the left and/or right Schur vectors of a complex matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.f b/lapack-netlib/SRC/DEPRECATED/zgegv.f index aa4ab3f71..542d3f4ff 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/zgegv.f @@ -1,4 +1,4 @@ -*> \brief ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief ZGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.f b/lapack-netlib/SRC/DEPRECATED/zggsvd.f index c0b9247a6..8a41e36c6 100644 --- a/lapack-netlib/SRC/DEPRECATED/zggsvd.f +++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.f @@ -106,7 +106,7 @@ *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of *> A and B implicitly gives the SVD of A*inv(B): *> A*inv(B) = U*(D1*inv(D2))*V**H. -*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also +*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also *> equal to the CS decomposition of A and B. Furthermore, the GSVD can *> be used to derive the solution of the eigenvalue problem: *> A**H*A x = lambda* B**H*B x.
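All of the row-major wrapper paths above follow the same pattern: check the leading dimensions, copy the row-major arrays into freshly allocated column-major buffers with LAPACKE_?ge_trans, run the column-major Fortran kernel on those buffers, and transpose the results back. A minimal self-contained sketch of that round trip, using hypothetical 2x3 data and the double-precision helper (the new wrappers do the same with LAPACKE_sge_trans and LAPACKE_zge_trans):

    #include <stdio.h>
    #include "lapacke_utils.h"

    int main(void) {
        /* Hypothetical row-major 2x3 matrix, leading dimension 3 */
        double a[6] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 };
        /* Column-major scratch buffer, leading dimension MAX(1,m) = 2 */
        double a_t[6];
        /* Row-major in -> column-major out, as done before each kernel call */
        LAPACKE_dge_trans( LAPACK_ROW_MAJOR, 2, 3, a, 3, a_t, 2 );
        /* ... a column-major LAPACK_* kernel would operate on a_t here ... */
        /* Column-major back to row-major, as done after the kernel call */
        LAPACKE_dge_trans( LAPACK_COL_MAJOR, 2, 3, a_t, 2, a, 3 );
        printf( "%g\n", a[4] );  /* prints 5: the round trip is lossless */
        return 0;
    }

Note that the m argument of the transpose-back must be the true row count of the result, which is why the geqrt fixes above pass nb rather than the caller's ldt.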
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 8cac42330..c75fd5f49 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o slatrs3.o strsyl3.o sgelst.o + sgesvdq.o slatrs3.o strsyl3.o sgelst.o sgedmd.o sgedmdq.o endif @@ -280,7 +280,7 @@ CLASRC_O = \ cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \ cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ - crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ + crot.o crscl.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ cstegr.o cstein.o csteqr.o \ csycon.o csymv.o \ @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o + cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o cgedmd.o cgedmdq.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o + dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o dgedmd.o dgedmdq.o endif ifdef USEXBLAS @@ -488,7 +488,7 @@ ZLASRC_O = \ zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ - zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ + zrot.o zrscl.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ zstegr.o zstein.o zsteqr.o \ zsycon.o zsymv.o \ @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o + zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o zgedmd.o zgedmdq.o endif ifdef USEXBLAS diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 25d8ee175..35e50cbc2 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -28,7 +28,7 @@ LULL = lu/LL/cgetrf.o lu/LL/dgetrf.o lu/LL/sgetrf.o lu/LL/zgetrf.o LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o -QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o +QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o .PHONY: all diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f index 16d250c3f..1b8e53cc2 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f @@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: right looking block version of the algorithm, calli C>\details \b Purpose: C>\verbatim C> -C> CPOTRF computes the Cholesky factorization of a real Hermitian +C> CPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. 
C> C> The factorization has the form @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f index 1575bd95c..e0a621b2e 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f index 67ebae335..f3b66a9e3 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f index 76d8bab61..dda42faf9 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f @@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: right looking block version of the algorithm, calli C>\details \b Purpose: C>\verbatim C> -C> ZPOTRF computes the Cholesky factorization of a real Hermitian +C> ZPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f index d6149c08f..b9dffa4ec 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f @@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: top-looking block version of the algorithm, calling C>\details \b Purpose: C>\verbatim C> -C> CPOTRF computes the Cholesky factorization of a real symmetric +C> CPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -55,7 +55,7 @@ C> C> \param[in,out] A C> \verbatim C> A is COMPLEX array, dimension (LDA,N) -C> On entry, the symmetric matrix A. If UPLO = 'U', the leading +C> On entry, the Hermitian matrix A. 
If UPLO = 'U', the leading C> N-by-N upper triangular part of A contains the upper C> triangular part of the matrix A, and the strictly lower C> triangular part of A is not referenced. If UPLO = 'L', the @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f index e49200ea5..e68a559ba 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f index 65895502b..7401f8844 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f index f8b9e253c..80b7c7f43 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f @@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: top-looking block version of the algorithm, calling C>\details \b Purpose: C>\verbatim C> -C> ZPOTRF computes the Cholesky factorization of a real symmetric +C> ZPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -55,7 +55,7 @@ C> C> \param[in,out] A C> \verbatim C> A is COMPLEX*16 array, dimension (LDA,N) -C> On entry, the symmetric matrix A. If UPLO = 'U', the leading +C> On entry, the Hermitian matrix A. If UPLO = 'U', the leading C> N-by-N upper triangular part of A contains the upper C> triangular part of the matrix A, and the strictly lower C> triangular part of A is not referenced. If UPLO = 'L', the @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. 
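The INFO rewording repeated across these variant files fixes a genuine imprecision: the leading principal minors of a Hermitian matrix are scalars (real ones, in fact), so a minor can fail to be positive but cannot fail to be "positive definite". The corrected text matches Sylvester's criterion, which is exactly what the blocked recursion tests pivot by pivot:

\[
A = U^{H}U \;(\texttt{UPLO}=\texttt{'U'}) \;\text{ or }\; A = LL^{H} \;(\texttt{UPLO}=\texttt{'L'}) \;\text{ exists}
\iff \det\bigl(A_{1:i,\,1:i}\bigr) > 0 \;\text{ for all } i = 1,\dots,N,
\]

and INFO = i reports the first index at which the computed pivot, i.e. \(\det(A_{1:i,1:i})/\det(A_{1:i-1,1:i-1})\), comes out nonpositive.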
C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 46eaf33b9..743731a00 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -23,7 +23,7 @@ C> \brief \b CGEQRF VARIANT: left-looking Level 3 BLAS version of the algorithm. C>\details \b Purpose: C>\verbatim C> -C> CGEQRF computes a QR factorization of a real M-by-N matrix A: +C> CGEQRF computes a QR factorization of a complex M-by-N matrix A: C> A = Q * R. C> C> This is the left-looking Level 3 BLAS version of the algorithm. @@ -172,12 +172,11 @@ C> EXTERNAL CGEQR2, CLARFB, CLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. @@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index 55cab8b23..bbdd46113 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -172,12 +172,11 @@ C> EXTERNAL DGEQR2, DLARFB, DLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. @@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f deleted file mode 100644 index a007360ba..000000000 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f +++ /dev/null @@ -1,86 +0,0 @@ -C> \brief \b SCEIL -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* REAL FUNCTION SCEIL( A ) -* -* .. Scalar Arguments .. -* REAL A -* .. -* -* ===================================================================== -* -* .. Intrinsic Functions .. -* INTRINSIC INT -* .. -* .. Executable Statements ..* -* -* IF (A-INT(A).EQ.0) THEN -* SCEIL = A -* ELSE IF (A.GT.0) THEN -* SCEIL = INT(A)+1; -* ELSE -* SCEIL = INT(A) -* END IF -* -* RETURN -* -* END -* Purpose -* ======= -* -C>\details \b Purpose: -C>\verbatim -C>\endverbatim -* -* Arguments: -* ========== -* -* -* Authors: -* ======== -* -C> \author Univ. of Tennessee -C> \author Univ. 
of California Berkeley -C> \author Univ. of Colorado Denver -C> \author NAG Ltd. -* -C> \date December 2016 -* -C> \ingroup variantsOTHERcomputational -* -* ===================================================================== - REAL FUNCTION SCEIL( A ) -* -* -- LAPACK computational routine -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* -* .. Scalar Arguments ..* - REAL A -* .. -* -* ===================================================================== -* -* .. Intrinsic Functions .. - INTRINSIC INT -* .. -* .. Executable Statements ..* -* - IF (A-INT(A).EQ.0) THEN - SCEIL = A - ELSE IF (A.GT.0) THEN - SCEIL = INT(A)+1; - ELSE - SCEIL = INT(A) - END IF - - RETURN -* - END diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index d2ad13ced..bf68d635b 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -172,12 +172,11 @@ C> EXTERNAL SGEQR2, SLARFB, SLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. @@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 623b88a8a..06918568e 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -23,7 +23,7 @@ C> \brief \b ZGEQRF VARIANT: left-looking Level 3 BLAS of the algorithm. C>\details \b Purpose: C>\verbatim C> -C> ZGEQRF computes a QR factorization of a real M-by-N matrix A: +C> ZGEQRF computes a QR factorization of a complex M-by-N matrix A: C> A = Q * R. C> C> This is the left-looking Level 3 BLAS version of the algorithm. @@ -172,12 +172,11 @@ C> EXTERNAL ZGEQR2, ZLARFB, ZLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. 
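The deleted qr/LL/sceil.f (and its object in the VARIANTS Makefile above) duplicated what the Fortran 90 intrinsic CEILING already does, with the extra wart of returning a REAL; the {c,d,s,z}geqrf.f edits accordingly move CEILING and REAL into the INTRINSIC list and drop SCEIL from EXTERNAL. A minimal demo of the identity the blocked workspace code relies on (assuming K, NB > 0):

      PROGRAM CEIL_DEMO
*     CEILING( REAL( K ) / REAL( NB ) )*NB is the smallest multiple
*     of NB that is >= K -- here the padded length LBWORK = 12.
      INTEGER K, NB, LBWORK
      K = 10
      NB = 4
      LBWORK = CEILING( REAL( K ) / REAL( NB ) )*NB
      PRINT *, LBWORK
      END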
@@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT diff --git a/lapack-netlib/SRC/cbdsqr.f b/lapack-netlib/SRC/cbdsqr.f index 1d7c4e09d..40706644e 100644 --- a/lapack-netlib/SRC/cbdsqr.f +++ b/lapack-netlib/SRC/cbdsqr.f @@ -259,7 +259,7 @@ $ NM12, NM13, OLDLL, OLDM REAL ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -372,7 +372,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -426,7 +426,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -435,7 +434,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -517,14 +515,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -545,14 +543,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -562,7 +560,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/cgebal.f b/lapack-netlib/SRC/cgebal.f index 5d1ebb026..3f54d3937 100644 --- a/lapack-netlib/SRC/cgebal.f +++ b/lapack-netlib/SRC/cgebal.f @@ -85,6 +85,7 @@ *> \verbatim *> ILO is INTEGER *> \endverbatim +*> *> \param[out] IHI *> \verbatim *> IHI is INTEGER @@ -154,6 +155,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -183,8 +187,8 @@ PARAMETER ( FACTOR = 0.95E+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L REAL C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -195,10 +199,10 @@ EXTERNAL SISNAN, LSAME, ICAMAX, SLAMCH, SCNRM2 * .. * .. External Subroutines .. - EXTERNAL CSSCAL, CSWAP, XERBLA + EXTERNAL XERBLA, CSSCAL, CSWAP * .. * .. Intrinsic Functions .. 
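Two independent cleanups meet around this point. In cbdsqr.f above, the auxiliary SMINL was only ever a renamed copy of SMIN, so the declaration, the now-dead SMIN = SMAX and SMIN = MIN( SMIN, ABSS ) updates, and the shift test all collapse onto SMIN itself; the convergence logic is unchanged. The cgebal.f hunks that follow replace the labeled GO TO 20/30/40/... permutation machinery with structured DO WHILE / EXIT loops and explicit quick returns. A compilable toy of the new row-isolation pattern (schematic only: the real CGEBAL records SCALE(L) = I before the swap and returns ILO = IHI = 1 rather than stopping):

      PROGRAM BAL_FLOW
*     Toy version of the restructured CGEBAL row-isolation pass: a row
*     whose off-diagonal entries are all zero carries an eigenvalue on
*     the diagonal and is swapped to the bottom of the active window
*     A(K:L,K:L), shrinking L.  (Real matrix here for brevity.)
      INTEGER N
      PARAMETER ( N = 3 )
      REAL A( N, N )
      LOGICAL NOCONV, CANSWAP
      INTEGER I, J, K, L
      DATA A / 1.0E0, 0.0E0, 4.0E0, 0.0E0, 2.0E0, 5.0E0,
     $         0.0E0, 0.0E0, 3.0E0 /
      K = 1
      L = N
      NOCONV = .TRUE.
      DO WHILE( NOCONV )
         NOCONV = .FALSE.
         DO I = L, 1, -1
            CANSWAP = .TRUE.
            DO J = 1, L
               IF( I.NE.J .AND. A( I, J ).NE.0.0E0 ) THEN
                  CANSWAP = .FALSE.
                  EXIT
               END IF
            END DO
            IF( CANSWAP ) THEN
               CALL SSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
               CALL SSWAP( N-K+1, A( I, K ), N, A( L, K ), N )
               NOCONV = .TRUE.
               IF( L.EQ.1 ) STOP
               L = L - 1
            END IF
         END DO
      END DO
      END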
- INTRINSIC ABS, AIMAG, MAX, MIN, REAL + INTRINSIC ABS, REAL, AIMAG, MAX, MIN * * Test the input parameters * @@ -216,176 +220,194 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL CSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL CSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( REAL( A( J, I ) ).NE.ZERO .OR. AIMAG( A( J, I ) ).NE. - $ ZERO )GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR. + $ AIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL CSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL CSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR. + $ AIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL CSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL CSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( REAL( A( I, J ) ).NE.ZERO .OR. AIMAG( A( I, J ) ).NE. - $ ZERO )GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. 
-* - DO 200 I = K, L -* - C = SCNRM2( L-K+1, A( K, I ), 1 ) - R = SCNRM2( L-K+1, A( I , K ), LDA ) - ICA = ICAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = ICAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( SISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'CGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L * - CALL CSSCAL( N-K+1, G, A( I, K ), LDA ) - CALL CSSCAL( L, F, A( 1, I ), 1 ) + C = SCNRM2( L-K+1, A( K, I ), 1 ) + R = SCNRM2( L-K+1, A( I, K ), LDA ) + ICA = ICAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = ICAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - 200 CONTINUE +* Guard against zero C or R due to underflow. +* + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( SISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'CGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. 
+* + CALL CSSCAL( N-K+1, G, A( I, K ), LDA ) + CALL CSSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/cgedmd.c b/lapack-netlib/SRC/cgedmd.c new file mode 100644 index 000000000..570395c7b --- /dev/null +++ b/lapack-netlib/SRC/cgedmd.c @@ -0,0 +1,1670 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char 
*addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. 
See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1:K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient. */ +/* The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by CGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */ +/* ZWORK is used as complex workspace in the complex SVD, as */ +/* specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing */ +/* the eigenvalues of a Rayleigh quotient. */ +/* If the call to CGEDMD is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. 
*/ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), */ +/* where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal */ +/* LZWORK_SVD is calculated as follows */ +/* If WHTSVD == 1 :: CGESVD :: */ +/* LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* If WHTSVD == 2 :: CGESDD :: */ +/* LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */ +/* If WHTSVD == 3 :: CGESVDQ :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: CGEJSV :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If on entry LZWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths and returns them in */ +/* LZWORK(1) and LZWORK(2), respectively. */ +/* ..... */ +/* RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */ +/* On exit, RWORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */ +/* scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to CGEDMD is only workspace query, then */ +/* RWORK(1) contains the minimal workspace length. */ +/* See the description of LRWORK. */ +/* ..... */ +/* LRWORK (input) INTEGER */ +/* The minimal length of the workspace vector RWORK. */ +/* LRWORK is calculated as follows: */ +/* LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where */ +/* LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */ +/* for the SVD subroutine determined by the input parameter */ +/* WHTSVD. */ +/* If WHTSVD == 1 :: CGESVD :: */ +/* LRWORK_SVD = 5*MIN(M,N) */ +/* If WHTSVD == 2 :: CGESDD :: */ +/* LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */ +/* 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */ +/* If WHTSVD == 3 :: CGESVDQ :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: CGEJSV :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If on entry LRWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* real workspace length and returns it in RWORK(1). */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for ZWORK, RWORK and */ +/* IWORK. See the descriptions of ZWORK, RWORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). 
*/ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --rwork; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + zzero.r = 0.f, zzero.i = 0.f; + zone.r = 1.f, zone.i = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -17; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -20; + } else if (*ldw < *n) { + *info = -22; + } else if (*lds < *n) { + *info = -24; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. 
*/ + if (lquery) { + iwork[1] = 1; + rwork[1] = 1.f; + zwork[1].r = 2.f, zwork[1].i = 0.f; + zwork[2].r = 2.f, zwork[2].i = 0.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + iminwr = 1; + mlrwrk = f2cmax(1,*n); + mlwork = 2; + olwork = 2; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of CGESVD: */ +/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n); + mwrsvd = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrsvd); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[ + b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, + rdummy, &info1); + lwrsvd = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvd); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of CGESDD: */ +/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */ +/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */ +/* In LAPACK 3.10.1 RWORK is defined differently. */ +/* Below we take f2cmax over the two versions. */ +/* IMINWR = 8*MIN(M,N) */ + mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax( + *m,*n); + mlwork = f2cmax(mlwork,mwrsdd); + iminwr = f2cmin(*m,*n) << 3; +/* Computing MAX */ +/* Computing MAX */ + i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(* + m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3, + i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) + << 1) * f2cmin(*m,*n) + f2cmin(*m,*n); + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, & + iwork[1], &info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) zwork[1].r; + lwrsdd = f2cmax(i__1,i__2); + olwork = f2cmax(olwork,lwrsdd); + } + } else if (*whtsvd == 3) { + cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvq); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0]; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvq); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; + cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, + &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[ + 1], &c_n1, rdummy, &c_n1, &iwork[1], &info1); + iminwr = iwork[1]; + mwrsvj = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvj); +/* Computing MAX */ +/* Computing MAX */ + i__3 = 7, i__4 = (integer) rdummy[0]; + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvj = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvj); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the CGEEV call */ +/* Computing MAX */ + i__1 = 1, i__2 = *n << 1; + mwrkev = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrkev); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (*n << 1); + 
mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], + ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], & + info1); +/* LAPACK CALL */ + lwrkev = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrkev); + olwork = f2cmax(2,olwork); + } + + if (*liwork < iminwr && ! lquery) { + *info = -30; + } + if (*lrwork < mlrwrk && ! lquery) { + *info = -28; + } + if (*lzwork < mlwork && ! lquery) { + *info = -26; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + rwork[1] = (real) mlrwrk; + zwork[1].r = (real) mlwork, zwork[1].i = 0.f; + zwork[2].r = (real) olwork, zwork[2].i = 0.f; + return 0; + } +/* ............................................................ */ + + ofl = slamch_("O") * slamch_("P"); + small = slamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using CLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = SCNRM2( M, X(1,i), 1 ) */ + scale = zero; + classq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("CGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[ + i__ * x_dim1 + 1], ldx, &info2); + rwork[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + x[i__ * x_dim1 + 1], ldx, &info2); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPAC */ + } + } else { + rwork[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (rwork[i__] > zero) { + r__1 = one / rwork[i__]; + csscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + r__1 = -rwork[i__]; + r__2 = one / (real) (*m); + clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * + y_dim1 + 1], ldy, &info2); +/* LAPACK C */ + } else if (c_abs(&y[icamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ + * y_dim1]) != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. 
If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + csscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using CLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* RWORK(i) = SCNRM2( M, Y(1,i), 1 ) */ + scale = zero; + classq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("CGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[ + i__ * y_dim1 + 1], ldy, &info2); + rwork[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* Y(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + y[i__ * y_dim1 + 1], ldy, &info2); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPA */ + } + } else { + rwork[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (rwork[i__] > zero) { + r__1 = one / rwork[i__]; + csscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + r__1 = -rwork[i__]; + r__2 = one / (real) (*m); + clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * + x_dim1 + 1], ldx, &info2); +/* LAPACK */ + } else if (c_abs(&x[icamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ + * x_dim1]) != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. 
*/
+    numrnk = *n;
+/*     SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset],
+		ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &
+		info1);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 2) {
+	cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, &
+		w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1]
+		, &info1);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 3) {
+	i__1 = *lrwork - *n;
+	cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1],
+		&z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1],
+		liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1);
+/* LAPACK CALL */
+	clacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 4) {
+	i__1 = *lrwork - *n;
+	cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, &
+		rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1],
+		lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1);
+	clacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = rwork[*n + 1];
+	xscl2 = rwork[*n + 2];
+	if (xscl1 != xscl2) {
+/*             This is an exceptional situation that can occur if the */
+/*             data matrices are not scaled and the largest singular */
+/*             value of X overflows. In that case CGEJSV can return */
+/*             the SVD in scaled form. The scaling factor can be used */
+/*             to rescale the data (X and Y). */
+	    clascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset],
+		    ldy, &info2);
+	}
+/*     END SELECT */
+    }
+
+    if (info1 > 0) {
+/*     The selected SVD subroutine did not converge. */
+/*     Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (rwork[1] == zero) {
+/*     The largest computed singular value of (scaled) */
+/*     X is zero. Return error code -8 */
+/*     (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("CGEDMD", &i__1);
+	return 0;
+    }
+
+/*     <3> Determine the numerical rank of the data */
+/*     snapshots matrix X. This depends on the */
+/*     parameters NRNK and TOL. */
+/*     SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*     END SELECT */
+    }
+/*     Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*     snapshot data in the input matrix X. */
+/*     <4> Compute the Rayleigh quotient S = U^H * A * U. */
+/*     Depending on the requested outputs, the computation */
+/*     is organized to compute additional auxiliary */
+/*     matrices (for the residuals and refinements). */
+
+/*     In all formulas below, we need V_k*Sigma_k^(-1) */
+/*     where either V_k is in W(1:N,1:K), or V_k^H is in */
+/*     W(1:K,1:N). Here Sigma_k=diag(RWORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    r__1 = one / rwork[i__];
+	    csscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1);
+/*             W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)       ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/*            This non-unit stride access is due to the fact */
+/*            that CGESVD, CGESVDQ and CGESDD return the */
+/*            adjoint matrix of the right singular vectors. */
+/*            DO i = 1, K */
+/*            CALL CSSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL */
+/*            ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC */
+/*            END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    rwork[*n + i__] = one / rwork[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * w_dim1;
+		i__4 = *n + i__;
+		q__2.r = rwork[i__4], q__2.i = zero;
+		i__5 = i__ + j * w_dim1;
+		q__1.r = q__2.r * w[i__5].r - q__2.i * w[i__5].i, q__1.i =
+			q__2.r * w[i__5].i + q__2.i * w[i__5].r;
+		w[i__3].r = q__1.r, w[i__3].i = q__1.i;
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/*           Need A*U(:,1:K)=Y*V_k*inv(diag(RWORK(1:K))) */
+/*           for computing the refined Ritz vectors */
+/*           (optionally, outside CGEDMD). */
+	cgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset],
+		 ldw, &zzero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='C' */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))            ! INTRINSIC, for T_OR_N=='N' */
+
+/*           At this point Z contains */
+/*           A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/*           this is needed for computing the residuals. */
+/*           This matrix is returned in the array B and */
+/*           it can be used to compute refined Ritz vectors. */
+/* LAPACK CALL */
+	clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */
+/* BLAS CALL */
+	cgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset],
+		ldz, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TRANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC */
+/*           At this point S = U^H * A * U is the Rayleigh quotient. */
+/* BLAS CALL */
+    } else {
+/*           A * U(:,1:K) is not explicitly needed and the */
+/*           computation is organized differently. The Rayleigh */
+/*           quotient is computed more efficiently. */
+	cgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy,
+		 &zzero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC */
+
+/* BLAS CALL */
+	cgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[
+		w_offset], ldw, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='C' */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N' */
+/*           At this point S = U^H * A * U is the Rayleigh quotient. */
+/*           If the residuals are requested, save scaled V_k into Z. */
+/*           Recall that V_k or V_k^H is stored in W. */
+/* BLAS CALL */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		clacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		clacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/*     <5> Compute the Ritz values and (if requested) the */
+/*     right eigenvectors of the Rayleigh quotient. */
+
+    cgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[
+	    w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1);
+/* LAPACK CALL */
+
+/*     W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/*     quotient. See the description of Z. */
+/*     Also, see the description of CGEEV. */
+    if (info1 > 0) {
+/*     CGEEV failed to compute the eigenvalues and */
+/*     eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/*     <6> Compute the eigenvectors (if requested) and */
+/*     the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/*                 Here, if the refinement is requested, we have */
+/*                 A*U(:,1:K) already computed and stored in Z. */
+/*                 For the residuals, need Y = A * U(:,1:K) * W. */
+		cgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */
+/*                 This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/*                 Compute S = V_k * Sigma_k^(-1) * W, where */
+/*                 V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */
+		cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &s[s_offset], lds);
+/*                 Then, compute Z = Y * S = */
+/*                 = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/*                 = A * U(:,1:K) * W(1:K,1:K) */
+		cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[
+			s_offset], lds, &zzero, &z__[z_offset], ldz);
+/*                 Save a copy of Z into Y and free Z for holding */
+/*                 the Ritz vectors. */
+		clacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/*             Compute S = V_k * Sigma_k^(-1) * W, where */
+/*             V_k * Sigma_k^(-1) is stored in Z */
+	    cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zzero, &s[s_offset], lds);
+/*             Then, compute Z = Y * S = */
+/*             = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/*             = A * U(:,1:K) * W(1:K,1:K) */
+	    cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset],
+		    lds, &zzero, &b[b_offset], ldb);
+/*             The above call replaces the following two calls */
+/*             that were used in the development and testing phase. */
+/*             CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */
+/*                         LDS, ZZERO, Z, LDZ) */
+/*             Save a copy of Z into Y and free Z for holding */
+/*             the Ritz vectors. */
+/*             CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/*        Compute the Ritz vectors */
+	if (wntvec) {
+	    cgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset],
+		    ldw, &zzero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__1 = *k;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		i__2 = i__;
+		q__1.r = -eigs[i__2].r, q__1.i = -eigs[i__2].i;
+		caxpy_(m, &q__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ *
+			y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTRINSIC */
+
+		res[i__] = scnrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	rwork[*n + 1] = xscl1;
+	rwork[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/*     A warning on possible data inconsistency. */
+/*     This should be a rare event. */
+	*info = 4;
+    }
+/*     ............................................................ */
+    return 0;
+/*     ...... */
+} /* cgedmd_ */
+
diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90
new file mode 100644
index 000000000..499489270
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmd.f90
@@ -0,0 +1,995 @@
+      SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL, &
+                         K, EIGS, Z, LDZ, RES, B, LDB, &
+                         W, LDW, S, LDS, ZWORK, LZWORK, &
+                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32
+!.....
+!
Scalar arguments
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N, LDX, LDY, &
+                                NRNK, LDZ, LDB, LDW, LDS, &
+                                LIWORK, LRWORK, LZWORK
+      INTEGER,       INTENT(OUT) :: K, INFO
+      REAL(KIND=WP), INTENT(IN)  :: TOL
+! Array arguments
+      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                         W(LDW,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!............................................................
+!     Purpose
+!     =======
+!     CGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices. For the input matrices
+!     X and Y such that Y = A*X with an inaccessible matrix
+!     A, CGEDMD computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular
+!     vectors of X. Optionally, CGEDMD returns the residuals
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391-421, 2014.
+!
+!......................................................................
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb; drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations
+!     expressed in this material are those of the author and
+!     do not necessarily reflect the views of the DARPA SBIR
+!     Program Office.
+!============================================================
+!     Distribution Statement A:
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================
+!......................................................................
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix.
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that the i-th column of X is a zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'N' :: No data scaling.
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product X(:,1:K)*W, where X
+!            contains a POD basis (leading left singular vectors
+!            of the data matrix X) and W contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of K, X, W, Z.
+!     'N' :: The eigenvectors are not computed.
+!.....
+!     JOBR (input) CHARACTER*1
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will be
+!            computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors).
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!.....
+!     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: CGESVD (the QR SVD algorithm)
+!     2 :: CGESDD (the Divide and Conquer algorithm; if enough
+!          workspace is available, this is the fastest option)
+!     3 :: CGESVDQ (the preconditioned QR SVD; this and 4
+!          are the most accurate options)
+!     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N', then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and has a stronger
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0
+!     The state space dimension (the row dimension of X, Y).
+!.....
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshot pairs
+!     (the number of columns of X and Y).
+!.....
+!     X (input/output) COMPLEX(KIND=WP) M-by-N array
+!     > On entry, X contains the data snapshot matrix X. It is
+!     assumed that the column norms of X are in the range of
+!     the normalized floating point numbers.
+!     < On exit, the leading K columns of X contain a POD basis,
+!     i.e. the leading K left singular vectors of the input
+!     data matrix X, U(:,1:K). All N columns of X contain all
+!     left singular vectors of the input matrix X.
+!     See the descriptions of K, Z and W.
+!.....
+!     LDX (input) INTEGER, LDX >= M
+!
The leading dimension of the array X. +!..... +! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K, and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of W(1:k,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and W. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) N-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +! See the description of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) M-by-N array. +! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:M,1:K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! If JOBF =='N', then B is not referenced. +! See the descriptions of X, W, K. +!..... +! LDB (input) INTEGER, LDB >= M +! The leading dimension of the array B. +!..... +! W (workspace/output) COMPLEX(KIND=WP) N-by-N array +! On exit, W(1:K,1:K) contains the K computed +! eigenvectors of the matrix Rayleigh quotient. +! 
The Ritz vectors (returned in Z) are the +! product of X (containing a POD basis for the input +! matrix X) and W. See the descriptions of K, S, X and Z. +! W is also used as a workspace to temporarily store the +! right singular vectors of X. +!..... +! LDW (input) INTEGER, LDW >= N +! The leading dimension of the array W. +!..... +! S (workspace/output) COMPLEX(KIND=WP) N-by-N array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by CGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +! ZWORK is used as complex workspace in the complex SVD, as +! specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +! the eigenvalues of a Rayleigh quotient. +! If the call to CGEDMD is only workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of work is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +! where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +! LZWORK_SVD is calculated as follows +! If WHTSVD == 1 :: CGESVD :: +! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +! If WHTSVD == 2 :: CGESDD :: +! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +! If WHTSVD == 3 :: CGESVDQ :: +! LZWORK_SVD = obtainable by a query +! If WHTSVD == 4 :: CGEJSV :: +! LZWORK_SVD = obtainable by a query +! If on entry LZWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths and returns them in +! LZWORK(1) and LZWORK(2), respectively. +!..... +! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +! On exit, RWORK(1:N) contains the singular values of +! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +! and Y to avoid overflow in the SVD of X. +! This may be of interest if the scaling option is off +! and as many as possible smallest eigenvalues are +! desired to the highest feasible accuracy. +! If the call to CGEDMD is only workspace query, then +! RWORK(1) contains the minimal workspace length. +! See the description of LRWORK. +!..... +! LRWORK (input) INTEGER +! The minimal length of the workspace vector RWORK. +! LRWORK is calculated as follows: +! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +! LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +! for the SVD subroutine determined by the input parameter +! WHTSVD. +! If WHTSVD == 1 :: CGESVD :: +! LRWORK_SVD = 5*MIN(M,N) +! If WHTSVD == 2 :: CGESDD :: +! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +! If WHTSVD == 3 :: CGESVDQ :: +! LRWORK_SVD = obtainable by a query +! If WHTSVD == 4 :: CGEJSV :: +! LRWORK_SVD = obtainable by a query +! If on entry LRWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! real workspace length and returns it in RWORK(1). +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! 
minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for ZWORK, RWORK and +! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) + +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, LWRSVJ, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK, MLRWRK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) + +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 + EXTERNAL CLANGE, SLAMCH, SCNRM2, ICAMAX + INTEGER ICAMAX + LOGICAL SISNAN, LSAME + EXTERNAL SISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CAXPY, CGEMM, CSSCAL + EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & + CLACPY, CLASCL, CLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC FLOAT, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) & + .OR. ( LRWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. 
( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -17 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -20 + ELSE IF ( LDW < N ) THEN + INFO = -22 + ELSE IF ( LDS < N ) THEN + INFO = -24 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + RWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + IMINWR = 1 + MLRWRK = MAX(1,N) + MLWORK = 2 + OLWORK = 2 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of CGESVD: + ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MLWORK = MAX(MLWORK,MWRSVD) + MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N)) + IF ( LQUERY ) THEN + CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, & + B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 ) + LWRSVD = INT( ZWORK(1) ) + OLWORK = MAX(OLWORK,LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of CGESDD: + ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N). + ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) + ! In LAPACK 3.10.1 RWORK is defined differently. + ! Below we take max over the two versions. + ! IMINWR = 8*MIN(M,N) + MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) + MLWORK = MAX(MLWORK,MWRSDD) + IMINWR = 8*MIN(M,N) + MLRWRK = MAX( MLRWRK, N + & + MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), & + 5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), & + 2*MAX(M,N)*MIN(M,N)+ & + 2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) + IF ( LQUERY ) THEN + CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 ) + LWRSDD = MAX(MWRSDD,INT( ZWORK(1) )) + OLWORK = MAX(OLWORK,LWRSDD) + END IF + CASE (3) + CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK, & + IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVQ) + MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVQ) + END IF + CASE (4) + JSVOPT = 'J' + CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, -1, RDUMMY, -1, IWORK, INFO1 ) + IMINWR = IWORK(1) + MWRSVJ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVJ) + MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1)))) + IF ( LQUERY ) THEN + LWRSVJ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the CGEEV call + MWRKEV = MAX( 1, 2*N ) + MLWORK = MAX(MLWORK,MWRKEV) + MLRWRK = MAX(MLRWRK,N+2*N) + IF ( LQUERY ) THEN + CALL CGEEV( 'N', JOBZL, N, S, LDS, EIGS, & + W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) ! LAPACK CALL + LWRKEV = INT(ZWORK(1)) + OLWORK = MAX( OLWORK, LWRKEV ) + OLWORK = MAX( 2, OLWORK ) + END IF +! + IF ( LIWORK < IMINWR .AND. 
(.NOT.LQUERY) ) INFO = -30 + IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26 + + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'CGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + RWORK(1) = MLRWRK + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = SLAMCH('O')*SLAMCH('P') + SMALL = SLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using CLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = SCNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), LDX, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + X(1,i), LDX, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( RWORK(i) > ZERO ) THEN + CALL CSSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL CLASCL( 'G', 0, 0, -RWORK(i), & + ONE/FLOAT(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(Y(ICAMAX(M, Y(1,i),1),i )) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL CSSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using CLASSQ. + DO i = 1, N + !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) + SCALE = ZERO + CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('CGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. 
+! Next, the norm of Y(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), LDY, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! Y(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + Y(1,i), LDY, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( RWORK(i) > ZERO ) THEN + CALL CSSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL CLASCL( 'G', 0, 0, -RWORK(i), & + ONE/FLOAT(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(X(ICAMAX(M, X(1,i),1),i )) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (2) + CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, & + LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (3) + CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, ZWORK, & + LZWORK, RWORK(N+1), LRWORK-N, INFO1) ! LAPACK CALL + CALL CLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'C' + CASE (4) + CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL CLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = RWORK(N+1) + XSCL2 = RWORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case CGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL CLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( RWORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. 
&
+                  ( RWORK(i) <= SMALL ) ) EXIT
+             K = K + 1
+          END DO
+      CASE ( -2 )
+          K = 1
+          DO i = 1, NUMRNK-1
+             IF ( ( RWORK(i+1) <= RWORK(i)*TOL ) .OR. &
+                  ( RWORK(i) <= SMALL ) ) EXIT
+             K = K + 1
+          END DO
+      CASE DEFAULT
+          K = 1
+          DO i = 2, NRNK
+             IF ( RWORK(i) <= SMALL ) EXIT
+             K = K + 1
+          END DO
+      END SELECT
+      !  Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !  snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^H * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^H is in
+      !    W(1:K,1:N). Here Sigma_k=diag(RWORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+             CALL CSSCAL( N, ONE/RWORK(i), W(1,i), 1 )    ! BLAS CALL
+             ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)       ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that CGESVD, CGESVDQ and CGESDD return the
+          ! adjoint matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL CSSCAL( N, ONE/RWORK(i), W(i,1), LDW )   ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)        ! INTRINSIC
+          !END DO
+          DO i = 1, K
+             RWORK(N+i) = ONE/RWORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(RWORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside CGEDMD).
+         CALL CGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, &
+                     LDW, ZZERO, Z, LDZ )   ! BLAS CALL
+         ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='C'
+         ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))            ! INTRINSIC, for T_OR_N=='N'
+         !
+         ! At this point Z contains
+         ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+         ! this is needed for computing the residuals.
+         ! This matrix is returned in the array B and
+         ! it can be used to compute refined Ritz vectors.
+         CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! LAPACK CALL
+         ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+         CALL CGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, &
+                     LDZ, ZZERO, S, LDS )   ! BLAS CALL
+         ! S(1:K,1:K) = MATMUL(TRANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC
+         ! At this point S = U^H * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL CGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, &
+                    ZZERO, Z, LDZ )   ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC
+        !
+        CALL CGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, &
+                    LDW, ZZERO, S, LDS )   ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='C'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^H * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^H is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+            IF ( LSAME(T_OR_N, 'N') ) THEN
+               CALL CLACPY( 'A', N, K, W, LDW, Z, LDZ )
+            ELSE
+               CALL CLACPY( 'A', K, N, W, LDW, Z, LDZ )
+            END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !    right eigenvectors of the Rayleigh quotient.
+      !
+      CALL CGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, &
+                  LDW, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient. See the description of Z.
+      ! Also, see the description of CGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! CGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and
+      !     the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+         IF ( WNTRES ) THEN
+            IF ( WNTREF ) THEN
+               ! Here, if the refinement is requested, we have
+               ! A*U(:,1:K) already computed and stored in Z.
+               ! For the residuals, need Y = A * U(:,1:K) * W.
+               CALL CGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, &
+                           LDW, ZZERO, Y, LDY )   ! BLAS CALL
+               ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC
+               ! This frees Z; Y contains A * U(:,1:K) * W.
+            ELSE
+               ! Compute S = V_k * Sigma_k^(-1) * W, where
+               ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z
+               CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                           W, LDW, ZZERO, S, LDS )
+               ! Then, compute Z = Y * S =
+               ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+               ! = A * U(:,1:K) * W(1:K,1:K)
+               CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                           LDS, ZZERO, Z, LDZ )
+               ! Save a copy of Z into Y and free Z for holding
+               ! the Ritz vectors.
+               CALL CLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+               IF ( WNTEX ) CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )
+            END IF
+         ELSE IF ( WNTEX ) THEN
+            ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                        W, LDW, ZZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                        LDS, ZZERO, B, LDB )
+            ! The above call replaces the following two calls
+            ! that were used in the development and testing phase.
+            ! CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+            !             LDS, ZZERO, Z, LDZ)
+            ! Save a copy of Z into Y and free Z for holding
+            ! the Ritz vectors.
+            ! CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )
+         END IF
+!
+         ! Compute the Ritz vectors
+         IF ( WNTVEC ) CALL CGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, &
+                                   ZZERO, Z, LDZ )   ! BLAS CALL
+         ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC
+!
+         IF ( WNTRES ) THEN
+            DO i = 1, K
+               CALL CAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL
+               ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)      ! INTRINSIC
+               RES(i) = SCNRM2( M, Y(1,i), 1 )                 ! BLAS CALL
+            END DO
+         END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          RWORK(N+1) = XSCL1
+          RWORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+! ......
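+!............................................................
+!     A minimal calling sketch (illustrative only, not part of
+!     the library source). It assumes M, N, TOL and the arrays
+!     X, Y, EIGS, Z, RES, B, W, S with their leading dimensions
+!     are already set up, and uses the recommended truncation
+!     option NRNK = -1. The two-pass workspace-query convention
+!     shown here follows the descriptions of LZWORK, LRWORK and
+!     LIWORK above; the dummy names ZDUM, RDUM, IDUM are
+!     hypothetical.
+!
+!       COMPLEX :: ZDUM(2)
+!       REAL    :: RDUM(2)
+!       INTEGER :: IDUM(2)
+!       ! Pass 1: workspace query (LZWORK = LRWORK = LIWORK = -1).
+!       CALL CGEDMD( 'S', 'V', 'R', 'N', 2, M, N, X, LDX, Y, LDY,  &
+!                    -1, TOL, K, EIGS, Z, LDZ, RES, B, LDB,        &
+!                    W, LDW, S, LDS, ZDUM, -1, RDUM, -1, IDUM, -1, &
+!                    INFO )
+!       ! Pass 2: allocate ZWORK(INT(ZDUM(1))), RWORK(INT(RDUM(1)))
+!       ! and IWORK(IDUM(1)), then repeat the call with the actual
+!       ! lengths to compute the Ritz pairs and residuals.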
+      END SUBROUTINE CGEDMD
+
diff --git a/lapack-netlib/SRC/cgedmdq.c b/lapack-netlib/SRC/cgedmdq.c
new file mode 100644
index 000000000..6e3a1faca
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmdq.c
@@ -0,0 +1,1289 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint *inex;	/*parameters in standard's order*/
+	ftnint *inopen;
+	ftnint *innum;
+	ftnint *innamed;
+	char *inname;
+	ftnlen innamlen;
+	char *inacc;
+	ftnlen inacclen;
+	char *inseq;
+	ftnlen inseqlen;
+	char *indir;
+	ftnlen indirlen;
+	char *infmt;
+	ftnlen infmtlen;
+	char *inform;
+	ftnint informlen;
+	char *inunf;
+	ftnlen inunflen;
+	ftnint *inrecl;
+	ftnint *innrec;
+	char *inblank;
+	ftnlen inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by CGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */ +/* See the description of ZWORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K) of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... 
*/ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N-1) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* Z*V, where Z contains orthonormal matrix (the product of */ +/* Q from the initial QR factorization and the SVD/POD_basis */ +/* returned by CGEDMD in X) and the second factor (the */ +/* eigenvectors of the Rayleigh quotient) is in the array V, */ +/* as returned by CGEDMD. That is, X(:,1:K)*V(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of V(1:K,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. 
*/
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) contains the K eigenvectors of */
+/*     the Rayleigh quotient. The Ritz vectors */
+/*     (returned in Z) are the product of Q from the initial QR */
+/*     factorization (see the description of F), X (see the */
+/*     description of X), and V. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by CGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */
+/*     On exit, */
+/*     ZWORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by CGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     If the call to CGEDMDQ is only a workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of ZWORK is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the workspace vector ZWORK. */
+/*     LZWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for CGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for CGEDMD (see the */
+/*                  description of LWORK in CGEDMD) */
+/*         MLWMQR = N (minimal workspace for */
+/*                     CUNMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for CUNGQR[M,N,N]) */
+/*         MINMN  = MIN(M,N) */
+/*     Then */
+/*     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */
+/*     is further updated as follows: */
+/*     if JOBZ == 'V' or JOBZ == 'F' THEN */
+/*        LZWORK = MAX( LZWORK, MINMN+MLWMQR ) */
+/*     if JOBQ == 'Q' THEN */
+/*        LZWORK = MAX( LZWORK, MINMN+MLWGQR ) */
+
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to CGEDMDQ is only a workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of WORK is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the workspace vector WORK. */
+/*     LWORK is the same as in CGEDMD, because in CGEDMDQ */
+/*     only CGEDMD requires real workspace for snapshots */
+/*     of dimensions MIN(M,N)-by-(N-1). */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2, 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK = -1 or LIWORK = -1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* .....
*/ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + zzero.r = 0.f, zzero.i = 0.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! 
(*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -21; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -24; + } else if (*ldv < *n - 1) { + *info = -26; + } else if (*lds < *n - 1) { + *info = -28; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlrwrk = 2; + mlwork = 2; + olwork = 2; + iminwr = 1; + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for CGEQRF. */ +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, & + info1); + olwqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwqr; + olwork = f2cmax(i__1,i__2); + } + i__1 = *n - 1; + cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset] + , ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &zwork[1], lzwork, &work[1], &c_n1, &iwork[1], + liwork, &info1); + mlwdmd = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = (integer) work[1]; + mlrwrk = f2cmax(i__1,i__2); + iminwr = f2cmax(iminwr,iwork[1]); + if (lquery) { + olwdmd = (integer) zwork[2].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], + &z__[z_offset], ldz, &zwork[1], &c_n1, &info1); + olwmqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], & + zwork[1], &c_n1, &info1); + olwgqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + if (*liwork < iminwr && ! lquery) { + *info = -34; + } + if (*lwork < mlrwrk && ! lquery) { + *info = -32; + } + if (*lzwork < mlwork && ! 
lquery) { + *info = -30; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + zwork[1].r = (real) mlwork, zwork[1].i = 0.f; + zwork[2].r = (real) olwork, zwork[2].i = 0.f; + work[1] = (real) mlrwrk; + work[2] = (real) mlrwrk; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lzwork - minmn; + cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + claset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx); + i__1 = *n - 1; + clacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + clacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + claset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lzwork - minmn; + cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, & + res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, & + zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See CGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + claset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lzwork - minmn; + cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by CGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by CGEDMD. */ + clacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + claset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lzwork - minmn; + cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to CGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. 
*/
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	claset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy);
+	clacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/* The orthonormal/unitary factor Q in the initial QR */
+/* factorization is optionally returned in the array F. */
+/* Same as with the triangular factor above, this is */
+/* useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lzwork - minmn;
+	cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn 
+		+ 1], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* cgedmdq_ */
+
diff --git a/lapack-netlib/SRC/cgedmdq.f90 b/lapack-netlib/SRC/cgedmdq.f90
new file mode 100644
index 000000000..52c1669c7
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmdq.f90
@@ -0,0 +1,689 @@
+SUBROUTINE CGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, &
+                    WHTSVD, M, N, F, LDF, X, LDX, Y,    &
+                    LDY, NRNK, TOL, K, EIGS,            &
+                    Z, LDZ, RES, B, LDB, V, LDV,        &
+                    S, LDS, ZWORK, LZWORK, WORK, LWORK, &
+                    IWORK, LIWORK, INFO )
+! March 2023
+!.....
+   USE iso_fortran_env
+   IMPLICIT NONE
+   INTEGER, PARAMETER :: WP = real32
+!.....
+! Scalar arguments
+   CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, &
+                            JOBT, JOBF
+   INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, &
+                          LDY, NRNK, LDZ, LDB, LDV, &
+                          LDS, LZWORK, LWORK, LIWORK
+   INTEGER, INTENT(OUT) :: INFO, K
+   REAL(KIND=WP), INTENT(IN) :: TOL
+! Array arguments
+   COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+   COMPLEX(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), &
+                                    Z(LDZ,*), B(LDB,*), &
+                                    V(LDV,*), S(LDS,*)
+   COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*)
+   COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*)
+   REAL(KIND=WP), INTENT(OUT) :: RES(*)
+   REAL(KIND=WP), INTENT(OUT) :: WORK(*)
+   INTEGER, INTENT(OUT) :: IWORK(*)
+!.....
+! Purpose
+! =======
+! CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+! a pair of data snapshot matrices, using a QR factorization
+! based compression of the data. For the input matrices
+! X and Y such that Y = A*X with an inaccessible matrix
+! A, CGEDMDQ computes a certain number of Ritz pairs of A using
+! the standard Rayleigh-Ritz extraction from a subspace of
+! range(X) that is determined using the leading left singular
+! vectors of X. Optionally, CGEDMDQ returns the residuals
+! of the computed Ritz pairs, the information needed for
+! a refinement of the Ritz vectors, or the eigenvectors of
+! the Exact DMD.
+! For further details see the references listed
+! below. For more details of the implementation see [3].
+!
+! References
+! ==========
+! [1] P. Schmid: Dynamic mode decomposition of numerical
+! and experimental data,
+! Journal of Fluid Mechanics 656, 5-28, 2010.
+! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+! decompositions: analysis and enhancements,
+! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+! [3] Z. Drmac: A LAPACK implementation of the Dynamic
+! Mode Decomposition I. Technical report. AIMDyn Inc.
+! and LAPACK Working Note 298.
+! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+! Brunton, N. Kutz: On Dynamic Mode Decomposition:
+! Theory and Applications, Journal of Computational
+! Dynamics 1(2), 391-421, 2014.
+!
+! Developed and supported by:
+! ===========================
+! Developed and coded by Zlatko Drmac, Faculty of Science,
+! University of Zagreb; drmac@math.hr
+! In cooperation with
+! AIMdyn Inc., Santa Barbara, CA.
+! and supported by
+! - DARPA SBIR project "Koopman Operator-Based Forecasting
+! for Nonstationary Processes from Near-Term, Limited
+! 
Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretised operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the inital QR facorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! unitary matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! 
Specifies whether to store information needed for post-
+! processing (e.g. computing refined Ritz vectors).
+! 'R' :: The matrix needed for the refinement of the Ritz
+! vectors is computed and stored in the array B.
+! See the description of B.
+! 'E' :: The unscaled eigenvectors of the Exact DMD are
+! computed and returned in the array B. See the
+! description of B.
+! 'N' :: No eigenvector refinement data is computed.
+! To be useful on exit, this option needs JOBQ='Q'.
+!.....
+! WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+! Allows for a selection of the SVD algorithm from the
+! LAPACK library.
+! 1 :: CGESVD (the QR SVD algorithm)
+! 2 :: CGESDD (the Divide and Conquer algorithm; if enough
+! workspace is available, this is the fastest option)
+! 3 :: CGESVDQ (the preconditioned QR SVD; this and 4
+! are the most accurate options)
+! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
+! are the most accurate options)
+! For the four methods above, a significant difference in
+! the accuracy of small singular values is possible if
+! the snapshots vary in norm so that X is severely
+! ill-conditioned. If small (smaller than EPS*||X||)
+! singular values are of interest and JOBS=='N', then
+! the options (3, 4) give the most accurate results, where
+! the option 4 is slightly better and with stronger
+! theoretical background.
+! If JOBS=='S', i.e. the columns of X will be normalized,
+! then all methods give nearly equally accurate results.
+!.....
+! M (input) INTEGER, M >= 0
+! The state space dimension (the number of rows of F).
+!.....
+! N (input) INTEGER, 0 <= N <= M
+! The number of data snapshots from a single trajectory,
+! taken at equidistant discrete times. This is the
+! number of columns of F.
+!.....
+! F (input/output) COMPLEX(KIND=WP) M-by-N array
+! > On entry,
+! the columns of F are the sequence of data snapshots
+! from a single trajectory, taken at equidistant discrete
+! times. It is assumed that the column norms of F are
+! in the range of the normalized floating point numbers.
+! < On exit,
+! If JOBQ == 'Q', the array F contains the unitary
+! matrix/factor of the QR factorization of the initial
+! data snapshot matrix F. See the description of JOBQ.
+! If JOBQ == 'N', the entries in F strictly below the main
+! diagonal contain, column-wise, the information on the
+! Householder vectors, as returned by CGEQRF. The
+! remaining information to restore the unitary matrix
+! of the initial QR factorization is stored in ZWORK(1:MIN(M,N)).
+! See the description of ZWORK.
+!.....
+! LDF (input) INTEGER, LDF >= M
+! The leading dimension of the array F.
+!.....
+! X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array
+! X is used as workspace to hold representations of the
+! leading N-1 snapshots in the orthonormal basis computed
+! in the QR factorization of F.
+! On exit, the leading K columns of X contain the leading
+! K left singular vectors of the above described content
+! of X. To lift them to the space of the left singular
+! vectors U(:,1:K) of the input data, pre-multiply with the
+! Q factor from the initial QR factorization.
+! See the descriptions of F, K, V and Z.
+!.....
+! LDX (input) INTEGER, LDX >= N
+! The leading dimension of the array X.
+!.....
+! Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-N array
+! Y is used as workspace to hold representations of the
+! trailing N-1 snapshots in the orthonormal basis computed
+! in the QR factorization of F.
+! On exit,
+! 
If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+! triangular factor from the QR factorization of the data
+! snapshot matrix F.
+!.....
+! LDY (input) INTEGER, LDY >= N
+! The leading dimension of the array Y.
+!.....
+! NRNK (input) INTEGER
+! Determines how to compute the numerical rank,
+! i.e. how to truncate small singular values of the input
+! matrix X. On input, if
+! NRNK = -1 :: i-th singular value sigma(i) is truncated
+! if sigma(i) <= TOL*sigma(1)
+! This option is recommended.
+! NRNK = -2 :: i-th singular value sigma(i) is truncated
+! if sigma(i) <= TOL*sigma(i-1)
+! This option is included for R&D purposes.
+! It requires highly accurate SVD, which
+! may not be feasible.
+! The numerical rank can be enforced by using a positive
+! value of NRNK as follows:
+! 0 < NRNK <= N-1 :: at most NRNK largest singular values
+! will be used. If the number of the computed nonzero
+! singular values is less than NRNK, then only those
+! nonzero values will be used and the actually used
+! dimension is less than NRNK. The actual number of
+! the nonzero singular values is returned in the variable
+! K. See the description of K.
+!.....
+! TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+! The tolerance for truncating small singular values.
+! See the description of NRNK.
+!.....
+! K (output) INTEGER, 0 <= K <= N
+! The dimension of the SVD/POD basis for the leading N-1
+! data snapshots (columns of F) and the number of the
+! computed Ritz pairs. The value of K is determined
+! according to the rule set by the parameters NRNK and
+! TOL. See the descriptions of NRNK and TOL.
+!.....
+! EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array
+! The leading K (K<=N-1) entries of EIGS contain
+! the computed eigenvalues (Ritz values).
+! See the descriptions of K and Z.
+!.....
+! Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array
+! If JOBZ == 'V' then Z contains the Ritz vectors. Z(:,i)
+! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+! Z*V, where Z contains an orthonormal matrix (the product
+! of Q from the initial QR factorization and the
+! SVD/POD_basis returned by CGEDMD in X) and the second
+! factor (the eigenvectors of the Rayleigh quotient) is in
+! the array V, as returned by CGEDMD. That is, X(:,1:K)*V(:,i)
+! is an eigenvector corresponding to EIGS(i). The columns
+! of V(1:K,1:K) are the computed eigenvectors of the
+! K-by-K Rayleigh quotient.
+! See the descriptions of EIGS, X and V.
+!.....
+! LDZ (input) INTEGER, LDZ >= M
+! The leading dimension of the array Z.
+!.....
+! RES (output) REAL(KIND=WP) (N-1)-by-1 array
+! RES(1:K) contains the residuals for the K computed
+! Ritz pairs,
+! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2.
+! See the descriptions of EIGS and Z.
+!.....
+! B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array.
+! If JOBF == 'R', B(1:N,1:K) contains A*U(:,1:K), and can
+! be used for computing the refined vectors; see further
+! details in the provided references.
+! If JOBF == 'E', B(1:N,1:K) contains
+! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+! Exact DMD, up to scaling by the inverse eigenvalues.
+! In both cases, the content of B can be lifted to the
+! original dimension of the input data by pre-multiplying
+! with the Q factor from the initial QR factorization.
+! Here A denotes a compression of the underlying operator.
+! See the descriptions of F and X.
+! If JOBF == 'N', then B is not referenced.
+!.....
+! LDB (input) INTEGER, LDB >= MIN(M,N)
+! 
The leading dimension of the array B.
+!.....
+! V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+! On exit, V(1:K,1:K) contains the K eigenvectors of
+! the Rayleigh quotient. The Ritz vectors
+! (returned in Z) are the product of Q from the initial QR
+! factorization (see the description of F), X (see the
+! description of X) and V.
+!.....
+! LDV (input) INTEGER, LDV >= N-1
+! The leading dimension of the array V.
+!.....
+! S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+! The array S(1:K,1:K) is used for the matrix Rayleigh
+! quotient. This content is overwritten during
+! the eigenvalue decomposition by CGEEV.
+! See the description of K.
+!.....
+! LDS (input) INTEGER, LDS >= N-1
+! The leading dimension of the array S.
+!.....
+! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+! On exit,
+! ZWORK(1:MIN(M,N)) contains the scalar factors of the
+! elementary reflectors as returned by CGEQRF of the
+! M-by-N input matrix F.
+! If the call to CGEDMDQ is only a workspace query, then
+! ZWORK(1) contains the minimal complex workspace length and
+! ZWORK(2) is the optimal complex workspace length.
+! Hence, the length of ZWORK is at least 2.
+! See the description of LZWORK.
+!.....
+! LZWORK (input) INTEGER
+! The minimal length of the workspace vector ZWORK.
+! LZWORK is calculated as follows:
+! Let MLWQR = N (minimal workspace for CGEQRF[M,N])
+! MLWDMD = minimal workspace for CGEDMD (see the
+! description of LWORK in CGEDMD)
+! MLWMQR = N (minimal workspace for
+! CUNMQR['L','N',M,N,N])
+! MLWGQR = N (minimal workspace for CUNGQR[M,N,N])
+! MINMN = MIN(M,N)
+! Then
+! LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD)
+! is further updated as follows:
+! if JOBZ == 'V' or JOBZ == 'F' THEN
+! LZWORK = MAX( LZWORK, MINMN+MLWMQR )
+! if JOBQ == 'Q' THEN
+! LZWORK = MAX( LZWORK, MINMN+MLWGQR )
+!
+!.....
+! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+! On exit,
+! WORK(1:N-1) contains the singular values of
+! the input submatrix F(1:M,1:N-1).
+! If the call to CGEDMDQ is only a workspace query, then
+! WORK(1) contains the minimal workspace length and
+! WORK(2) is the optimal workspace length. Hence, the
+! length of WORK is at least 2.
+! See the description of LWORK.
+!.....
+! LWORK (input) INTEGER
+! The minimal length of the workspace vector WORK.
+! LWORK is the same as in CGEDMD, because in CGEDMDQ
+! only CGEDMD requires real workspace for snapshots
+! of dimensions MIN(M,N)-by-(N-1).
+! If on entry LWORK = -1, then a workspace query is
+! assumed and the procedure only computes the minimal
+! and the optimal workspace lengths for both WORK and
+! IWORK. See the descriptions of WORK and IWORK.
+!.....
+! IWORK (workspace/output) INTEGER LIWORK-by-1 array
+! Workspace that is required only if WHTSVD equals
+! 2, 3 or 4. (See the description of WHTSVD).
+! If on entry LWORK = -1 or LIWORK = -1, then the
+! minimal length of IWORK is computed and returned in
+! IWORK(1). See the description of LIWORK.
+!.....
+! LIWORK (input) INTEGER
+! The minimal length of the workspace vector IWORK.
+! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >= 1
+! Let M1=MIN(M,N), N1=N-1. Then
+! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+! If on entry LIWORK = -1, then a workspace query is
+! assumed and the procedure only computes the minimal
+! and the optimal workspace lengths for both WORK and
+! IWORK. See the descriptions of WORK and IWORK.
+!.....
+! 
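For illustration, the workspace query may be coded as
+! follows (a hedged sketch, not part of the formal interface;
+! it assumes the caller has declared the shown arrays and
+! scalars, with ZWORK and WORK of length at least 2 and
+! IWORK of length at least 1 for the query itself):
+!
+!    CALL CGEDMDQ( 'S', 'V', 'R', 'Q', 'R', 'N', 1, M, N,    &
+!                  F, LDF, X, LDX, Y, LDY, -1, TOL, K, EIGS, &
+!                  Z, LDZ, RES, B, LDB, V, LDV, S, LDS,      &
+!                  ZWORK, -1, WORK, -1, IWORK, -1, INFO )
+!    LIWORK = IWORK(1)      ! minimal integer workspace
+!    LWORK  = INT(WORK(2))  ! optimal real workspace
+!    LZWORK = INT(ZWORK(2)) ! optimal complex workspace
+!
+! after which the workspace arrays can be allocated with the
+! returned lengths and CGEDMDQ called again to perform the
+! computation.
+!.....
+! 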
INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MINMN, MLRWRK, & + MLWDMD, MLWGQR, MLWMQR, MLWORK, & + MLWQR, OLWDMD, OLWGQR, OLWMQR, & + OLWORK, OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CGEQRF, CLACPY, CLASET, CUNGQR, & + CUNMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -21 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -24 + ELSE IF ( LDV < N-1 ) THEN + INFO = -26 + ELSE IF ( LDS < N-1 ) THEN + INFO = -28 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! 
Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + MLRWRK = 2 + MLWORK = 2 + OLWORK = 2 + IMINWR = 1 + MLWQR = MAX(1,N) ! Minimal workspace length for CGEQRF. + MLWORK = MAX(MLWORK,MINMN + MLWQR) + + IF ( LQUERY ) THEN + CALL CGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, & + INFO1 ) + OLWQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN + OLWQR) + END IF + CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, -1, IWORK,& + LIWORK, INFO1 ) + MLWDMD = INT(ZWORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + MLRWRK = MAX(MLRWRK, INT(WORK(1))) + IMINWR = MAX(IMINWR, IWORK(1)) + IF ( LQUERY ) THEN + OLWDMD = INT(ZWORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK, MINMN+MLWMQR) + IF ( LQUERY ) THEN + CALL CUNMQR( 'L','N', M, N, MINMN, F, LDF, & + ZWORK, Z, LDZ, ZWORK, -1, INFO1 ) + OLWMQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK, MINMN+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = MAX(1,N) + MLWORK = MAX(MLWORK, MINMN+MLWGQR) + IF ( LQUERY ) THEN + CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK, -1, INFO1 ) + OLWGQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK, MINMN+OLWGQR) + END IF + END IF + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34 + IF ( LWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'CGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + WORK(1) = MLRWRK + WORK(2) = MLRWRK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL CGEQRF( M, N, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL CLASET( 'L', MINMN, N-1, ZZERO, ZZERO, X, LDX ) + CALL CLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL CLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL CLASET( 'L', MINMN-2, N-2, ZZERO, ZZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & + WORK, LWORK, IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See CGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. 
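+        ! (Rows MINMN+1:M of Z are cleared first; the unitary
+        ! factor Q from the initial QR factorization is then
+        ! applied to the K computed vectors by CUNMQR below.)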
+ IF ( M > MINMN ) CALL CLASET( 'A', M-MINMN, K, ZZERO, & + ZZERO, Z(MINMN+1,1), LDZ ) + CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by CGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by CGEDMD. + CALL CLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL CLASET( 'A', M-N, K, ZZERO, ZZERO, & + Z(N+1,1), LDZ ) + CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to CGEDMDQ is to be + +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL CLASET( 'A', MINMN, N, ZZERO, ZZERO, Y, LDY ) + CALL CLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/unitary factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE CGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 062ac182b..1fc75613e 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -52,10 +52,10 @@ *> are computed and stored in the arrays U and V, respectively. The diagonal *> of [SIGMA] is computed and stored in the array SVA. *> \endverbatim -*> -*> Arguments: -*> ========== -*> +* +* Arguments: +* ========== +* *> \param[in] JOBA *> \verbatim *> JOBA is CHARACTER*1 @@ -151,7 +151,7 @@ *> transposed A if A^* seems to be better with respect to convergence. *> If the matrix is not square, JOBT is ignored. *> The decision is based on two values of entropy over the adjoint -*> orbit of A^* * A. See the descriptions of WORK(6) and WORK(7). +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). *> = 'T': transpose if entropy test indicates possibly faster *> convergence of Jacobi process if A^* is taken as input. If A is *> replaced with A^*, then the row pivoting is included automatically. @@ -209,11 +209,11 @@ *> \verbatim *> SVA is REAL array, dimension (N) *> On exit, -*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the -*> computation SVA contains Euclidean column norms of the +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the *> iterated matrices in the array A. -*> - For WORK(1) .NE. WORK(2): The singular values of A are -*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if *> sigma_max(A) overflows or if small singular values have been *> saved from underflow by scaling the input matrix A. 
*> - If JOBR='R' then some of the singular values may be returned @@ -252,7 +252,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then *> copied back to the U array. This 'W' option is just @@ -1819,7 +1819,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1842,7 +1842,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1886,7 +1886,7 @@ ELSE IF ( CONDR2 .LT. COND_OK ) THEN * * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* is Q3^* * V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f index 1dfbd3f2b..553087bf4 100644 --- a/lapack-netlib/SRC/cgelqt3.f +++ b/lapack-netlib/SRC/cgelqt3.f @@ -159,7 +159,8 @@ * * Compute Householder transform when M=1 * - CALL CLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL CLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/SRC/cgelsd.f b/lapack-netlib/SRC/cgelsd.f index fce4ca6e2..c3c77bf63 100644 --- a/lapack-netlib/SRC/cgelsd.f +++ b/lapack-netlib/SRC/cgelsd.f @@ -60,12 +60,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index da6b9092f..d1e38c504 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -170,7 +170,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -214,8 +214,7 @@ * .. External Subroutines .. EXTERNAL CBDSQR, CCOPY, CGEBRD, CGELQF, CGEMM, CGEMV, $ CGEQRF, CLACPY, CLASCL, CLASET, CSRSCL, CUNGBR, - $ CUNMBR, CUNMLQ, CUNMQR, SLABAD, SLASCL, SLASET, - $ XERBLA + $ CUNMBR, CUNMLQ, CUNMQR, SLASCL, SLASET, XERBLA * .. * .. External Functions .. 
INTEGER ILAENV @@ -388,7 +387,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -540,7 +538,7 @@ $ LDB, CZERO, WORK, N ) CALL CLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL CCOPY( N, WORK, 1, B, 1 ) END IF @@ -645,7 +643,7 @@ CALL CLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, CZERO, WORK( IWORK ), 1 ) CALL CCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -737,7 +735,7 @@ $ LDB, CZERO, WORK, N ) CALL CLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL CCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/cgelsy.f b/lapack-netlib/SRC/cgelsy.f index 67140f191..64bb10023 100644 --- a/lapack-netlib/SRC/cgelsy.f +++ b/lapack-netlib/SRC/cgelsy.f @@ -116,6 +116,7 @@ *> B is COMPLEX array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -148,6 +149,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index b824374d3..1838629ae 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ b/lapack-netlib/SRC/cgesdd.f @@ -53,12 +53,6 @@ *> *> Note that the routine returns VT = V**H, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cgesvdq.f b/lapack-netlib/SRC/cgesvdq.f index 1a587eca3..95091c2c0 100644 --- a/lapack-netlib/SRC/cgesvdq.f +++ b/lapack-netlib/SRC/cgesvdq.f @@ -363,7 +363,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/cgetf2.f b/lapack-netlib/SRC/cgetf2.f index aac989970..995ee40ec 100644 --- a/lapack-netlib/SRC/cgetf2.f +++ b/lapack-netlib/SRC/cgetf2.f @@ -101,7 +101,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup getf2 * * ===================================================================== SUBROUTINE CGETF2( M, N, A, LDA, IPIV, INFO ) @@ -126,16 +126,14 @@ $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. - REAL SFMIN - INTEGER I, J, JP + INTEGER J, JP * .. * .. External Functions .. - REAL SLAMCH INTEGER ICAMAX - EXTERNAL SLAMCH, ICAMAX + EXTERNAL ICAMAX * .. * .. External Subroutines .. 
- EXTERNAL CGERU, CSCAL, CSWAP, XERBLA + EXTERNAL CGERU, CRSCL, CSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -161,10 +159,6 @@ * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN -* -* Compute machine safe minimum -* - SFMIN = SLAMCH('S') * DO 10 J = 1, MIN( M, N ) * @@ -181,15 +175,8 @@ * * Compute elements J+1:M of J-th column. * - IF( J.LT.M ) THEN - IF( ABS(A( J, J )) .GE. SFMIN ) THEN - CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) - ELSE - DO 20 I = 1, M-J - A( J+I, J ) = A( J+I, J ) / A( J, J ) - 20 CONTINUE - END IF - END IF + IF( J.LT.M ) + $ CALL CRSCL( M-J, A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f index 37853a10a..7426ecdf1 100644 --- a/lapack-netlib/SRC/cgsvj0.f +++ b/lapack-netlib/SRC/cgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f index f1a5204e3..18dd690cc 100644 --- a/lapack-netlib/SRC/cgsvj1.f +++ b/lapack-netlib/SRC/cgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. 
*> \endverbatim diff --git a/lapack-netlib/SRC/chbevd.f b/lapack-netlib/SRC/chbevd.f index 1598f4de5..de33c9039 100644 --- a/lapack-netlib/SRC/chbevd.f +++ b/lapack-netlib/SRC/chbevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian band matrix A. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chbevd_2stage.f b/lapack-netlib/SRC/chbevd_2stage.f index 340c546e8..3c9c8ecc0 100644 --- a/lapack-netlib/SRC/chbevd_2stage.f +++ b/lapack-netlib/SRC/chbevd_2stage.f @@ -47,12 +47,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chbgvd.f b/lapack-netlib/SRC/chbgvd.f index c4ad20753..655006370 100644 --- a/lapack-netlib/SRC/chbgvd.f +++ b/lapack-netlib/SRC/chbgvd.f @@ -46,12 +46,6 @@ *> and banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index 2ddf74b98..dce0b2083 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -41,12 +41,6 @@ *> complex Hermitian matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cheevd_2stage.f b/lapack-netlib/SRC/cheevd_2stage.f index 830e13d30..a0e8843ae 100644 --- a/lapack-netlib/SRC/cheevd_2stage.f +++ b/lapack-netlib/SRC/cheevd_2stage.f @@ -46,12 +46,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. 
It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chegv.f b/lapack-netlib/SRC/chegv.f index f7675a19f..198e5d102 100644 --- a/lapack-netlib/SRC/chegv.f +++ b/lapack-netlib/SRC/chegv.f @@ -160,7 +160,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chegv_2stage.f b/lapack-netlib/SRC/chegv_2stage.f index 472581c4b..d2b8fc795 100644 --- a/lapack-netlib/SRC/chegv_2stage.f +++ b/lapack-netlib/SRC/chegv_2stage.f @@ -179,7 +179,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 4b7f43d52..4edc36f2a 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -43,12 +43,6 @@ *> B are assumed to be Hermitian and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -212,7 +206,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chegvx.f b/lapack-netlib/SRC/chegvx.f index 6e428242d..8e565222d 100644 --- a/lapack-netlib/SRC/chegvx.f +++ b/lapack-netlib/SRC/chegvx.f @@ -280,7 +280,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index e474c6fad..36970a329 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. 
*> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/chetf2_rk.f b/lapack-netlib/SRC/chetf2_rk.f index 4bb032382..e687ec64a 100644 --- a/lapack-netlib/SRC/chetf2_rk.f +++ b/lapack-netlib/SRC/chetf2_rk.f @@ -480,7 +480,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -508,7 +508,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) @@ -834,7 +834,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -862,7 +862,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/chetf2_rook.f b/lapack-netlib/SRC/chetf2_rook.f index ee4eaf68f..49fba1bda 100644 --- a/lapack-netlib/SRC/chetf2_rook.f +++ b/lapack-netlib/SRC/chetf2_rook.f @@ -420,7 +420,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -441,7 +441,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) @@ -733,7 +733,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -754,7 +754,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index e047ab720..30b01ed83 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -537,7 +537,7 @@ C END IF $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 diff --git 
a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 54567b8cd..d9e4fbd19 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 20a9859e8..400efdf26 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/chpevd.f b/lapack-netlib/SRC/chpevd.f index c44462394..06d01064d 100644 --- a/lapack-netlib/SRC/chpevd.f +++ b/lapack-netlib/SRC/chpevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chpgv.f b/lapack-netlib/SRC/chpgv.f index 417f10121..660724e05 100644 --- a/lapack-netlib/SRC/chpgv.f +++ b/lapack-netlib/SRC/chpgv.f @@ -144,7 +144,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 65d08b783..c24ca1360 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -205,7 +199,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. 
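The chpgv/chpgvd routines patched above take A and B in packed (AP/BP) storage, where only one triangle is kept, column by column. A hedged sketch of the 'U' packing rule AP(i + (j-1)*j/2) = A(i,j) for 1 <= i <= j (1-based); the helper name and the dense input layout are my own:

#include <complex.h>

/* Pack the upper triangle of an n-by-n Hermitian matrix A (dense,
   column-major, leading dimension lda) into the AP layout used by
   CHPGV/CHPGVD with UPLO = 'U'. Column j contributes its rows 0..j,
   so k sweeps exactly the 1-based rule AP(i + (j-1)*j/2) = A(i,j). */
static void pack_upper(int n, const float _Complex *a, int lda,
                       float _Complex *ap) {
    size_t k = 0;
    for (int j = 0; j < n; ++j)
        for (int i = 0; i <= j; ++i)
            ap[k++] = a[i + (size_t)j * lda];   /* k ends at n*(n+1)/2 */
}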
*> \endverbatim diff --git a/lapack-netlib/SRC/chpgvx.f b/lapack-netlib/SRC/chpgvx.f index 711daf55f..2646800cc 100644 --- a/lapack-netlib/SRC/chpgvx.f +++ b/lapack-netlib/SRC/chpgvx.f @@ -250,7 +250,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/cla_gbrfsx_extended.f b/lapack-netlib/SRC/cla_gbrfsx_extended.f index 6d43c8325..e8b96fd6e 100644 --- a/lapack-netlib/SRC/cla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/cla_gbrfsx_extended.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU, +* SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU, * NRHS, AB, LDAB, AFB, LDAFB, IPIV, * COLEQU, C, B, LDB, Y, LDY, * BERR_OUT, N_NORMS, ERR_BNDS_NORM, @@ -400,7 +400,7 @@ *> \ingroup complexGBcomputational * * ===================================================================== - SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU, + SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU, $ NRHS, AB, LDAB, AFB, LDAFB, IPIV, $ COLEQU, C, B, LDB, Y, LDY, $ BERR_OUT, N_NORMS, ERR_BNDS_NORM, @@ -651,7 +651,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_gerfsx_extended.f b/lapack-netlib/SRC/cla_gerfsx_extended.f index 4b1031101..e524db5ad 100644 --- a/lapack-netlib/SRC/cla_gerfsx_extended.f +++ b/lapack-netlib/SRC/cla_gerfsx_extended.f @@ -637,7 +637,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_herfsx_extended.f b/lapack-netlib/SRC/cla_herfsx_extended.f index 6d007ef58..19f845692 100644 --- a/lapack-netlib/SRC/cla_herfsx_extended.f +++ b/lapack-netlib/SRC/cla_herfsx_extended.f @@ -654,7 +654,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_porfsx_extended.f b/lapack-netlib/SRC/cla_porfsx_extended.f index 9ced9b1b9..2dafecb35 100644 --- a/lapack-netlib/SRC/cla_porfsx_extended.f +++ b/lapack-netlib/SRC/cla_porfsx_extended.f @@ -625,7 +625,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_porpvgrw.f b/lapack-netlib/SRC/cla_porpvgrw.f index 1eb706d1a..78cd19da9 100644 --- a/lapack-netlib/SRC/cla_porpvgrw.f +++ b/lapack-netlib/SRC/cla_porpvgrw.f @@ -140,9 +140,9 @@ * .. Executable Statements .. UPPER = LSAME( 'Upper', UPLO ) * -* SPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* SPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. 
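The "Update solution" comments corrected in the *rfsx_extended hunks above sit at the heart of extra-precise iterative refinement: each pass solves A*dy = r for the current residual r and then nudges the iterate with y := y + dy, which the Fortran does with CAXPY and ALPHA = (1.0E+0,0.0E+0). A CBLAS sketch of that single step, with illustrative names:

#include <complex.h>
#include <cblas.h>

/* One refinement update y := y + dy. dy holds the correction obtained
   by solving A*dy = residual; n is the column length. */
static void update_solution(int n, const float _Complex *dy,
                            float _Complex *y) {
    const float _Complex one = 1.0f;   /* alpha = (1.0, 0.0) */
    cblas_caxpy(n, &one, dy, 1, y, 1);
}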
* RPVGRW = 1.0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.f b/lapack-netlib/SRC/cla_syrfsx_extended.f index 4fe538a98..95f969731 100644 --- a/lapack-netlib/SRC/cla_syrfsx_extended.f +++ b/lapack-netlib/SRC/cla_syrfsx_extended.f @@ -654,7 +654,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/claed7.f b/lapack-netlib/SRC/claed7.f index 9d2f97141..72f1417d4 100644 --- a/lapack-netlib/SRC/claed7.f +++ b/lapack-netlib/SRC/claed7.f @@ -363,7 +363,7 @@ RETURN END IF * -* Prepare the INDXQ sorting premutation. +* Prepare the INDXQ sorting permutation. * N1 = K N2 = N - K diff --git a/lapack-netlib/SRC/claed8.f b/lapack-netlib/SRC/claed8.f index c15a0365f..1600087ab 100644 --- a/lapack-netlib/SRC/claed8.f +++ b/lapack-netlib/SRC/claed8.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, +* SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, * Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, * GIVCOL, GIVNUM, INFO ) * @@ -29,7 +29,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), +* REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), * $ Z( * ) * COMPLEX Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -122,9 +122,9 @@ *> destroyed during the updating process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> Contains a copy of the first K eigenvalues which will be used *> by SLAED3 to form the secular equation. *> \endverbatim @@ -222,7 +222,7 @@ *> \ingroup complexOTHERcomputational * * ===================================================================== - SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, + SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, $ Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, $ GIVCOL, GIVNUM, INFO ) * @@ -237,7 +237,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), + REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), $ Z( * ) COMPLEX Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -322,14 +322,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -438,7 +438,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -450,19 +450,19 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 100 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. 
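The deflation bookkeeping in CLAED8 just above relies on SLAMRG, which takes two ascending sublists stored back to back and returns a permutation merging them into one ascending list. A simplified C sketch of that merge-permutation idea, restricted to unit strides (an illustration, not the LAPACK source):

/* Given a[0..n1-1] and a[n1..n1+n2-1], each sorted ascending, fill
   indx with 1-based positions (as SLAMRG does) so that a[indx[k]-1]
   sweeps the union in ascending order. */
static void merge_perm(int n1, int n2, const float *a, int *indx) {
    int i = 0, j = n1, k = 0;
    while (i < n1 && j < n1 + n2) {
        if (a[i] <= a[j]) { indx[k++] = i + 1; i++; }
        else              { indx[k++] = j + 1; j++; }
    }
    while (i < n1)      { indx[k++] = i + 1; i++; }   /* drain left run */
    while (j < n1 + n2) { indx[k++] = j + 1; j++; }   /* drain right run */
}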
* DO 110 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL CCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 110 CONTINUE @@ -471,7 +471,7 @@ * into the last N - K slots of D and Q respectively. * IF( K.LT.N ) THEN - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL CLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ), $ LDQ ) END IF diff --git a/lapack-netlib/SRC/clals0.f b/lapack-netlib/SRC/clals0.f index e981fc36f..0b545d5d7 100644 --- a/lapack-netlib/SRC/clals0.f +++ b/lapack-netlib/SRC/clals0.f @@ -392,6 +392,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = POLES( I, 2 )*Z( I ) / $ ( SLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -470,6 +475,11 @@ IF( Z( J ).EQ.ZERO ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent optimizing +* compilers from doing x+(y+z). +* RWORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/clalsa.f b/lapack-netlib/SRC/clalsa.f index 06883ab20..fceb380f5 100644 --- a/lapack-netlib/SRC/clalsa.f +++ b/lapack-netlib/SRC/clalsa.f @@ -42,9 +42,9 @@ *> *> \verbatim *> -*> CLALSA is an itermediate step in solving the least squares problem +*> CLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/clalsd.f b/lapack-netlib/SRC/clalsd.f index a2da9a925..bdd6b31c5 100644 --- a/lapack-netlib/SRC/clalsd.f +++ b/lapack-netlib/SRC/clalsd.f @@ -48,12 +48,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 6de40e06c..c6cc5847d 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -89,7 +89,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. 
Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -310,7 +310,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, + EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, $ CLARTG, CROT REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME @@ -462,7 +462,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) @@ -533,7 +532,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f index ee6959ed8..5f042e345 100644 --- a/lapack-netlib/SRC/clarfb_gett.f +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -452,7 +452,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f index 63cbd02c6..1a09b8305 100644 --- a/lapack-netlib/SRC/claswlq.f +++ b/lapack-netlib/SRC/claswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB*M. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/clatdf.f b/lapack-netlib/SRC/clatdf.f index 5445e387e..1e7d71669 100644 --- a/lapack-netlib/SRC/clatdf.f +++ b/lapack-netlib/SRC/clatdf.f @@ -227,7 +227,7 @@ BM = RHS( J ) - CONE SPLUS = ONE * -* Lockahead for L- part RHS(1:N-1) = +-1 +* Look-ahead for L- part RHS(1:N-1) = +-1 * SPLUS and SMIN computed more efficiently than in BSOLVE[1]. * SPLUS = SPLUS + REAL( CDOTC( N-J, Z( J+1, J ), 1, Z( J+1, diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index a902f1ed0..0502f6898 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -577,7 +577,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index 7a0b85487..377190081 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. 
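The comments introduced in the clals0.f hunk above explain that SLAMC3 is called only to force the evaluation order (x+y)+z against compilers that would reassociate it into x+(y+z). A C analogue of the same barrier, assuming nothing beyond standard C; the helper name is mine:

/* Route a sum through a volatile slot so the optimizer must
   materialize x + y before anything else touches it. */
static float add_fenced(float x, float y) {
    volatile float s = x + y;   /* volatile store defeats reassociation */
    return s;
}

/* Usage: the ordering matters when the next operand nearly cancels,
   e.g. the CLALS0 denominator (POLES(I,2) + DSIGJ) - DIFLJ:
       float denom = add_fenced(poles_i2, dsigj) - diflj;           */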
*> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/cpbsv.f b/lapack-netlib/SRC/cpbsv.f index 248abbc1f..889bbde08 100644 --- a/lapack-netlib/SRC/cpbsv.f +++ b/lapack-netlib/SRC/cpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cpbsvx.f b/lapack-netlib/SRC/cpbsvx.f index 652e18501..975c87768 100644 --- a/lapack-netlib/SRC/cpbsvx.f +++ b/lapack-netlib/SRC/cpbsvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -280,10 +280,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpbtf2.f b/lapack-netlib/SRC/cpbtf2.f index 0be2c0a7f..f5bc9b3a6 100644 --- a/lapack-netlib/SRC/cpbtf2.f +++ b/lapack-netlib/SRC/cpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpbtrf.f b/lapack-netlib/SRC/cpbtrf.f index a4c18efb3..af60780c8 100644 --- a/lapack-netlib/SRC/cpbtrf.f +++ b/lapack-netlib/SRC/cpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. 
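The cpbsv/cpbtrf hunks above deal with Hermitian positive definite band matrices, which LAPACK expects in band storage: for UPLO = 'U', column j of A occupies AB(KD+1+i-j, j) for max(1,j-KD) <= i <= j (1-based). A zero-based packing sketch under those conventions; the function and its dense-input layout are illustrative:

#include <complex.h>

/* Pack the upper band (bandwidth kd) of an n-by-n Hermitian matrix A
   (dense, column-major, leading dimension lda) into 'U' band storage
   AB with ldab >= kd+1. With 0-based i, j the 1-based rule
   AB(kd+1+i-j, j) = A(i,j) becomes row index kd+i-j, so the diagonal
   lands in row kd of AB. */
static void pack_band_upper(int n, int kd,
                            const float _Complex *a, int lda,
                            float _Complex *ab, int ldab) {
    for (int j = 0; j < n; ++j) {
        int ilo = (j - kd > 0) ? j - kd : 0;
        for (int i = ilo; i <= j; ++i)
            ab[(kd + i - j) + (size_t)j * ldab] = a[i + (size_t)j * lda];
    }
}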
*> \endverbatim * diff --git a/lapack-netlib/SRC/cpftrf.f b/lapack-netlib/SRC/cpftrf.f index cbaab6832..12799c6f7 100644 --- a/lapack-netlib/SRC/cpftrf.f +++ b/lapack-netlib/SRC/cpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> *> Further Notes on RFP Format: diff --git a/lapack-netlib/SRC/cposv.f b/lapack-netlib/SRC/cposv.f index f37dfa3c0..ea6fc37db 100644 --- a/lapack-netlib/SRC/cposv.f +++ b/lapack-netlib/SRC/cposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cposvx.f b/lapack-netlib/SRC/cposvx.f index 78b9f4db1..322a26447 100644 --- a/lapack-netlib/SRC/cposvx.f +++ b/lapack-netlib/SRC/cposvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -276,10 +276,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cposvxx.f b/lapack-netlib/SRC/cposvxx.f index 7834c75da..c40a2d856 100644 --- a/lapack-netlib/SRC/cposvxx.f +++ b/lapack-netlib/SRC/cposvxx.f @@ -87,7 +87,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). 
If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/cpotf2.f b/lapack-netlib/SRC/cpotf2.f index 2f4658bae..d84988949 100644 --- a/lapack-netlib/SRC/cpotf2.f +++ b/lapack-netlib/SRC/cpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpotrf.f b/lapack-netlib/SRC/cpotrf.f index 6aba3103e..e2b120a49 100644 --- a/lapack-netlib/SRC/cpotrf.f +++ b/lapack-netlib/SRC/cpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpotrf2.f b/lapack-netlib/SRC/cpotrf2.f index e1eae3e9d..ea2e4ca98 100644 --- a/lapack-netlib/SRC/cpotrf2.f +++ b/lapack-netlib/SRC/cpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cppsv.f b/lapack-netlib/SRC/cppsv.f index 1e6f02695..a8fd660c4 100644 --- a/lapack-netlib/SRC/cppsv.f +++ b/lapack-netlib/SRC/cppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cppsvx.f b/lapack-netlib/SRC/cppsvx.f index f6f07538c..2ef02100f 100644 --- a/lapack-netlib/SRC/cppsvx.f +++ b/lapack-netlib/SRC/cppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix, L is a lower triangular *> matrix, and **H indicates conjugate transpose. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. 
*> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpptrf.f b/lapack-netlib/SRC/cpptrf.f index 4e81458cb..e36f834cb 100644 --- a/lapack-netlib/SRC/cpptrf.f +++ b/lapack-netlib/SRC/cpptrf.f @@ -79,9 +79,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be -*> completed. +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could +*> not be completed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cpteqr.f b/lapack-netlib/SRC/cpteqr.f index e3af59041..fc9c44908 100644 --- a/lapack-netlib/SRC/cpteqr.f +++ b/lapack-netlib/SRC/cpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/cptsv.f b/lapack-netlib/SRC/cptsv.f index 20ee32bbe..4c16f6a0a 100644 --- a/lapack-netlib/SRC/cptsv.f +++ b/lapack-netlib/SRC/cptsv.f @@ -94,8 +94,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/cptsvx.f b/lapack-netlib/SRC/cptsvx.f index db63a3c36..6f7d8cf5b 100644 --- a/lapack-netlib/SRC/cptsvx.f +++ b/lapack-netlib/SRC/cptsvx.f @@ -60,7 +60,7 @@ *> factorization can also be regarded as having the form *> A = U**H*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -205,10 +205,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. 
Nevertheless, the diff --git a/lapack-netlib/SRC/cpttrf.f b/lapack-netlib/SRC/cpttrf.f index c6379e923..111343b78 100644 --- a/lapack-netlib/SRC/cpttrf.f +++ b/lapack-netlib/SRC/cpttrf.f @@ -71,8 +71,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/crscl.c b/lapack-netlib/SRC/crscl.c new file mode 100644 index 000000000..7c87553d5 --- /dev/null +++ b/lapack-netlib/SRC/crscl.c @@ -0,0 +1,735 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <complex.h> +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char 
*indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download CRSCL + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CRSCL( N, A, X, INCX ) */ + +/* INTEGER INCX, N */ +/* COMPLEX A */ +/* COMPLEX X( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CRSCL multiplies an n-element complex vector x by the complex scalar */ +/* > 1/a. This is done without overflow or underflow as long as */ +/* > the final result x/a does not overflow or underflow. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of components of the vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX */ +/* > The scalar a which is used to divide each component of x. */ +/* > A must not be 0, or the subroutine will divide by zero. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension */ +/* > (1+(N-1)*abs(INCX)) */ +/* > The n-element vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] INCX */ +/* > \verbatim */ +/* > INCX is INTEGER */ +/* > The increment between successive values of the vector X. */ +/* > > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. 
*/ + +/* > \ingroup complexOTHERauxiliary */ + +/* ===================================================================== */ +/* Subroutine */ int crscl_(integer *n, complex *a, complex *x, integer *incx) +{ + /* System generated locals */ + real r__1, r__2; + complex q__1; + + /* Local variables */ + real absi, absr; + extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + integer *); + real ai, ar, ui, ov, ur; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real safmin, safmax; + extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + *); + + +/* -- LAPACK auxiliary routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Quick return if possible */ + + /* Parameter adjustments */ + --x; + + /* Function Body */ + if (*n <= 0) { + return 0; + } + +/* Get machine parameters */ + + safmin = slamch_("S"); + safmax = 1.f / safmin; + ov = slamch_("O"); + +/* Initialize constants related to A. */ + + ar = a->r; + ai = r_imag(a); + absr = abs(ar); + absi = abs(ai); + + if (ai == 0.f) { +/* If alpha is real, then we can use csrscl */ + csrscl_(n, &ar, &x[1], incx); + + } else if (ar == 0.f) { +/* If alpha has a zero real part, then we follow the same rules as if */ +/* alpha were real. */ + if (absi > safmax) { + csscal_(n, &safmin, &x[1], incx); + r__1 = -safmax / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + } else if (absi < safmin) { + r__1 = -safmin / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + csscal_(n, &safmax, &x[1], incx); + } else { + r__1 = -1.f / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + } + + } else { +/* The following numbers can be computed. */ +/* They are the inverse of the real and imaginary parts of 1/alpha. */ +/* Note that a and b are always different from zero. */ +/* NaNs are only possible if either: */ +/* 1. alphaR or alphaI is NaN. */ +/* 2. alphaR and alphaI are both infinite, in which case it makes sense */ +/* to propagate a NaN. */ + ur = ar + ai * (ai / ar); + ui = ai + ar * (ar / ai); + + if (abs(ur) < safmin || abs(ui) < safmin) { +/* This means that both alphaR and alphaI are very small. */ + r__1 = safmin / ur; + r__2 = -safmin / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + csscal_(n, &safmax, &x[1], incx); + } else if (abs(ur) > safmax || abs(ui) > safmax) { + if (absr > ov || absi > ov) { +/* This means that a and b are both Inf. No need for scaling. */ + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } else { + csscal_(n, &safmin, &x[1], incx); + if (abs(ur) > ov || abs(ui) > ov) { +/* Infs were generated. We do proper scaling to avoid them. 
*/ + if (absr >= absi) { +/* ABS( UR ) <= ABS( UI ) */ + ur = safmin * ar + safmin * (ai * (ai / ar)); + ui = safmin * ai + ar * (safmin * ar / ai); + } else { +/* ABS( UR ) > ABS( UI ) */ + ur = safmin * ar + ai * (safmin * ai / ar); + ui = safmin * ai + safmin * (ar * (ar / ai)); + } + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } else { + r__1 = safmax / ur; + r__2 = -safmax / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } + } + } else { + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } + } + + return 0; + +/* End of CRSCL */ + +} /* crscl_ */ + diff --git a/lapack-netlib/SRC/crscl.f b/lapack-netlib/SRC/crscl.f new file mode 100644 index 000000000..22919cd62 --- /dev/null +++ b/lapack-netlib/SRC/crscl.f @@ -0,0 +1,202 @@ +*> \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CRSCL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CRSCL( N, A, X, INCX ) +* +* .. Scalar Arguments .. +* INTEGER INCX, N +* COMPLEX A +* .. +* .. Array Arguments .. +* COMPLEX X( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CRSCL multiplies an n-element complex vector x by the complex scalar +*> 1/a. This is done without overflow or underflow as long as +*> the final result x/a does not overflow or underflow. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of components of the vector x. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX +*> The scalar a which is used to divide each component of x. +*> A must not be 0, or the subroutine will divide by zero. +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension +*> (1+(N-1)*abs(INCX)) +*> The n-element vector x. +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> The increment between successive values of the vector X. +*> > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERauxiliary +* +* ===================================================================== + SUBROUTINE CRSCL( N, A, X, INCX ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INCX, N + COMPLEX A +* .. +* .. Array Arguments .. + COMPLEX X( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + REAL SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR + % , UI +* .. +* .. External Functions .. + REAL SLAMCH + COMPLEX CLADIV + EXTERNAL SLAMCH, CLADIV +* .. +* .. External Subroutines .. + EXTERNAL CSCAL, CSSCAL, CSRSCL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. +* .. Executable Statements .. 
+* +* Quick return if possible +* + IF( N.LE.0 ) + $ RETURN +* +* Get machine parameters +* + SAFMIN = SLAMCH( 'S' ) + SAFMAX = ONE / SAFMIN + OV = SLAMCH( 'O' ) +* +* Initialize constants related to A. +* + AR = REAL( A ) + AI = AIMAG( A ) + ABSR = ABS( AR ) + ABSI = ABS( AI ) +* + IF( AI.EQ.ZERO ) THEN +* If alpha is real, then we can use csrscl + CALL CSRSCL( N, AR, X, INCX ) +* + ELSE IF( AR.EQ.ZERO ) THEN +* If alpha has a zero real part, then we follow the same rules as if +* alpha were real. + IF( ABSI.GT.SAFMAX ) THEN + CALL CSSCAL( N, SAFMIN, X, INCX ) + CALL CSCAL( N, CMPLX( ZERO, -SAFMAX / AI ), X, INCX ) + ELSE IF( ABSI.LT.SAFMIN ) THEN + CALL CSCAL( N, CMPLX( ZERO, -SAFMIN / AI ), X, INCX ) + CALL CSSCAL( N, SAFMAX, X, INCX ) + ELSE + CALL CSCAL( N, CMPLX( ZERO, -ONE / AI ), X, INCX ) + END IF +* + ELSE +* The following numbers can be computed. +* They are the inverse of the real and imaginary parts of 1/alpha. +* Note that a and b are always different from zero. +* NaNs are only possible if either: +* 1. alphaR or alphaI is NaN. +* 2. alphaR and alphaI are both infinite, in which case it makes sense +* to propagate a NaN. + UR = AR + AI * ( AI / AR ) + UI = AI + AR * ( AR / AI ) +* + IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN +* This means that both alphaR and alphaI are very small. + CALL CSCAL( N, CMPLX( SAFMIN / UR, -SAFMIN / UI ), X, INCX ) + CALL CSSCAL( N, SAFMAX, X, INCX ) + ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN + IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN +* This means that a and b are both Inf. No need for scaling. + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL CSSCAL( N, SAFMIN, X, INCX ) + IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN +* Infs were generated. We do proper scaling to avoid them. + IF( ABSR.GE.ABSI ) THEN +* ABS( UR ) <= ABS( UI ) + UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR )) + UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI ) + ELSE +* ABS( UR ) > ABS( UI ) + UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR ) + UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI )) + END IF + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL CSCAL( N, CMPLX( SAFMAX / UR, -SAFMAX / UI ), + $ X, INCX ) + END IF + END IF + ELSE + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + END IF + END IF +* + RETURN +* +* End of CRSCL +* + END diff --git a/lapack-netlib/SRC/cstedc.f b/lapack-netlib/SRC/cstedc.f index a57d9eaef..77a4ec3be 100644 --- a/lapack-netlib/SRC/cstedc.f +++ b/lapack-netlib/SRC/cstedc.f @@ -43,12 +43,6 @@ *> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cstegr.f b/lapack-netlib/SRC/cstegr.f index 9d6e06da3..a162d5076 100644 --- a/lapack-netlib/SRC/cstegr.f +++ b/lapack-netlib/SRC/cstegr.f @@ -56,7 +56,7 @@ *> *> Note : CSTEGR and CSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. 
Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f index d49684db3..9d47450e3 100644 --- a/lapack-netlib/SRC/cstemr.f +++ b/lapack-netlib/SRC/cstemr.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -329,7 +329,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE CSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -361,7 +362,8 @@ $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -397,6 +399,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * SSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -519,6 +522,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -526,8 +538,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -550,8 +567,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/csyconvf.f b/lapack-netlib/SRC/csyconvf.f index f3b46f0b4..febbc411c 100644 --- a/lapack-netlib/SRC/csyconvf.f +++ b/lapack-netlib/SRC/csyconvf.f @@ -39,7 +39,7 @@ *> CSYTRF provided on entry in parameter A into the factorization *> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in CSYTRF into +*> the interchanges stored in IPIV from the format used in CSYTRF into *> the format used in CSYTRF_RK (or CSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or CSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in CSYTRF that is stored *> on exit in parameter A. 
It also converts in place details of -*> the intechanges stored in IPIV from the format used in CSYTRF_RK +*> the interchanges stored in IPIV from the format used in CSYTRF_RK *> (or CSYTRF_BK) into the format used in CSYTRF. *> *> CSYCONVF can also convert in Hermitian matrix case, i.e. between @@ -325,7 +325,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -469,7 +469,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -535,7 +535,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/csyconvf_rook.f b/lapack-netlib/SRC/csyconvf_rook.f index a5b9d82da..0da34e0fa 100644 --- a/lapack-netlib/SRC/csyconvf_rook.f +++ b/lapack-netlib/SRC/csyconvf_rook.f @@ -520,7 +520,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index d8881a748..22227505c 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f index ebf228f18..951196b83 100644 --- a/lapack-netlib/SRC/csytrf.f +++ b/lapack-netlib/SRC/csytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f index b1165a425..c5467bf01 100644 --- a/lapack-netlib/SRC/csytrf_aa.f +++ b/lapack-netlib/SRC/csytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index 8a2cfd7bc..b21df8cd3 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). 
*> \endverbatim *> diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f index 9c2b7182f..996801e7d 100644 --- a/lapack-netlib/SRC/csytrf_rk.f +++ b/lapack-netlib/SRC/csytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ctgevc.f b/lapack-netlib/SRC/ctgevc.f index 4e5289cb2..22144f259 100644 --- a/lapack-netlib/SRC/ctgevc.f +++ b/lapack-netlib/SRC/ctgevc.f @@ -53,7 +53,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal elements of S and P. *> @@ -154,7 +154,7 @@ *> \verbatim *> VR is COMPLEX array, dimension (LDVR,MM) *> On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must -*> contain an N-by-N matrix Q (usually the unitary matrix Z +*> contain an N-by-N matrix Z (usually the unitary matrix Z *> of right Schur vectors returned by CHGEQZ). *> On exit, if SIDE = 'R' or 'B', VR contains: *> if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P); @@ -259,7 +259,7 @@ EXTERNAL LSAME, SLAMCH, CLADIV * .. * .. External Subroutines .. - EXTERNAL CGEMV, SLABAD, XERBLA + EXTERNAL CGEMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -367,7 +367,6 @@ * SAFMIN = SLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL SLABAD( SAFMIN, BIG ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/ctgsen.f b/lapack-netlib/SRC/ctgsen.f index f9b6cd10c..ffd638099 100644 --- a/lapack-netlib/SRC/ctgsen.f +++ b/lapack-netlib/SRC/ctgsen.f @@ -339,7 +339,7 @@ *> [ kron(In2, B11) -kron(B22**H, In1) ]. *> *> Here, Inx is the identity matrix of size nx and A22**H is the -*> conjuguate transpose of A22. kron(X, Y) is the Kronecker product between +*> conjugate transpose of A22. kron(X, Y) is the Kronecker product between *> the matrices X and Y. *> *> When DIF(2) is small, small changes in (A, B) can cause large changes diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index 0f58696b2..11b32104d 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -321,9 +321,9 @@ * INFO = 0 NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK - RWORK(1) = N + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ctrexc.f b/lapack-netlib/SRC/ctrexc.f index ea64ddf9b..2bc0348fb 100644 --- a/lapack-netlib/SRC/ctrexc.f +++ b/lapack-netlib/SRC/ctrexc.f @@ -40,7 +40,7 @@ *> *> The Schur form T is reordered by a unitary similarity transformation *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by -*> postmultplying it with Z. +*> postmultiplying it with Z. 
*> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cunbdb1.f b/lapack-netlib/SRC/cunbdb1.f index 740e38a85..80faa8808 100644 --- a/lapack-netlib/SRC/cunbdb1.f +++ b/lapack-netlib/SRC/cunbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index b45db6100..94b9fdbf9 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/cunbdb3.f b/lapack-netlib/SRC/cunbdb3.f index b532bfbc8..f942bc698 100644 --- a/lapack-netlib/SRC/cunbdb3.f +++ b/lapack-netlib/SRC/cunbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index 117f23d08..a551c184e 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> CUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/cuncsd2by1.f b/lapack-netlib/SRC/cuncsd2by1.f index 8360d5932..f0c44f670 100644 --- a/lapack-netlib/SRC/cuncsd2by1.f +++ b/lapack-netlib/SRC/cuncsd2by1.f @@ -212,13 +212,13 @@ *> LRWORK is INTEGER *> The dimension of the array RWORK. *> -*> If LRWORK = -1, then a workspace query is assumed; the routine +*> If LRWORK=-1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK and RWORK *> arrays, returns this value as the first entry of the WORK *> and RWORK array, respectively, and no error message related *> to LWORK or LRWORK is issued by XERBLA. *> \endverbatim -* +*> *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q)) diff --git a/lapack-netlib/SRC/cungtsqr.f b/lapack-netlib/SRC/cungtsqr.f index 64ccb9731..1734be2aa 100644 --- a/lapack-netlib/SRC/cungtsqr.f +++ b/lapack-netlib/SRC/cungtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -302,4 +303,4 @@ * * End of CUNGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dbdsdc.f b/lapack-netlib/SRC/dbdsdc.f index 99fe82296..4b6c3e694 100644 --- a/lapack-netlib/SRC/dbdsdc.f +++ b/lapack-netlib/SRC/dbdsdc.f @@ -45,13 +45,6 @@ *> respectively. DBDSDC can be used to compute all singular values, *> and optionally, singular vectors or singular vectors in compact form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. 
-*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLASD3 for details. -*> *> The code currently calls DLASDQ if singular values only are desired. *> However, it can be slightly modified to compute singular values *> using the divide and conquer method. diff --git a/lapack-netlib/SRC/dbdsqr.f b/lapack-netlib/SRC/dbdsqr.f index c220a5875..bc697a007 100644 --- a/lapack-netlib/SRC/dbdsqr.f +++ b/lapack-netlib/SRC/dbdsqr.f @@ -278,7 +278,7 @@ $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM DOUBLE PRECISION ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -391,7 +391,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -451,7 +451,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -460,7 +459,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -542,14 +540,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -570,14 +568,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -587,7 +585,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/dbdsvdx.f b/lapack-netlib/SRC/dbdsvdx.f index 4ccd4edad..4668a88f2 100644 --- a/lapack-netlib/SRC/dbdsvdx.f +++ b/lapack-netlib/SRC/dbdsvdx.f @@ -45,7 +45,7 @@ *> *> Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ] *> and superdiagonal E = [ e_1 e_2 ... e_N-1 ], DBDSVDX computes the -*> singular value decompositon of B through the eigenvalues and +*> singular value decomposition of B through the eigenvalues and *> eigenvectors of the N*2-by-N*2 tridiagonal matrix *> *> | 0 d_1 | diff --git a/lapack-netlib/SRC/dgebal.f b/lapack-netlib/SRC/dgebal.f index 821c7704a..f7b38b378 100644 --- a/lapack-netlib/SRC/dgebal.f +++ b/lapack-netlib/SRC/dgebal.f @@ -153,6 +153,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -181,8 +184,8 @@ PARAMETER ( FACTOR = 0.95D+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L DOUBLE PRECISION C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. 
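The hunk below replaces DGEBAL's GOTO ladder with structured DO WHILE loops; the interface and the documented ILO/IHI/SCALE semantics are unchanged. For orientation, here is a minimal and entirely hypothetical calling sketch (the program name, the 4-by-4 test matrix, and the JOB = 'B' choice, permute and scale, are illustrative and not part of the patch):

      PROGRAM BALDEMO
!     Balance a badly scaled matrix before an eigenvalue computation.
      IMPLICIT NONE
      INTEGER, PARAMETER :: N = 4, LDA = N
      DOUBLE PRECISION   :: A(LDA,N), SCALE(N)
      INTEGER            :: ILO, IHI, INFO, I, J
      EXTERNAL           :: DGEBAL
!     Entries spanning several orders of magnitude.
      DO J = 1, N
         DO I = 1, N
            A(I,J) = 10.0D0**(I-J)
         END DO
      END DO
      CALL DGEBAL( 'B', N, A, LDA, ILO, IHI, SCALE, INFO )
!     On exit, eigenvalues isolated by permutations sit outside
!     rows/columns ILO:IHI; SCALE records the permutations and the
!     diagonal scaling factors applied to A(ILO:IHI,ILO:IHI).
      PRINT *, 'ILO =', ILO, ' IHI =', IHI, ' INFO =', INFO
      END PROGRAM BALDEMO

A subsequent DGEHRD/DHSEQR call would then operate only on rows and columns ILO through IHI, which is the point of balancing.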
@@ -214,177 +217,192 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL DSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL DSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( A( J, I ).NE.ZERO ) - $ GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL DSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL DSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL DSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL DSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( A( I, J ).NE.ZERO ) - $ GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 * - 140 CONTINUE - NOCONV = .FALSE. + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. 
* - DO 200 I = K, L + DO I = K, L * - C = DNRM2( L-K+1, A( K, I ), 1 ) - R = DNRM2( L-K+1, A( I, K ), LDA ) - ICA = IDAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = IDAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) + C = DNRM2( L-K+1, A( K, I ), 1 ) + R = DNRM2( L-K+1, A( I, K ), LDA ) + ICA = IDAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = IDAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * -* Guard against zero C or R due to underflow. +* Guard against zero C or R due to underflow. * - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( DISNAN( C+F+CA+R+G+RA ) ) THEN + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE * * Exit if NaN to avoid infinite loop * - INFO = -3 - CALL XERBLA( 'DGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. -* - CALL DSCAL( N-K+1, G, A( I, K ), LDA ) - CALL DSCAL( L, F, A( 1, I ), 1 ) -* - 200 CONTINUE -* - IF( NOCONV ) - $ GO TO 140 + IF( DISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'DGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. 
+*
+         CALL DSCAL( N-K+1, G, A( I, K ), LDA )
+         CALL DSCAL( L, F, A( 1, I ), 1 )
+*
+      END DO
+*
+   END DO
 *
-  210 CONTINUE
    ILO = K
    IHI = L
 *
diff --git a/lapack-netlib/SRC/dgedmd.c b/lapack-netlib/SRC/dgedmd.c
new file mode 100644
index 000000000..66b4d5da6
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmd.c
@@ -0,0 +1,1753 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint *inex;	/*parameters in standard's order*/
+	ftnint *inopen;
+	ftnint *innum;
+	ftnint *innamed;
+	char *inname;
+	ftnlen innamlen;
+	char *inacc;
+	ftnlen inacclen;
+	char *inseq;
+	ftnlen inseqlen;
+	char *indir;
+	ftnlen indirlen;
+	char *infmt;
+	ftnlen infmtlen;
+	char *inform;
+	ftnint informlen;
+	char *inunf;
+	ftnlen inunflen;
+	ftnint *inrecl;
+	ftnint *innrec;
+	char *inblank;
+	ftnlen inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char
*addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
	integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+#endif
+	pCf(z) = zdotc;
+}
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__0 = 0;
+static integer c__2 = 2;
+
+/* Subroutine */ int dgedmd_(char *jobs, char *jobz, char *jobr, char *jobf,
+	integer *whtsvd, integer *m, integer *n, doublereal *x, integer *ldx,
+	doublereal *y, integer *ldy, integer *nrnk, doublereal *tol,
+	integer *k, doublereal *reig, doublereal *imeig, doublereal *z__,
+	integer *ldz, doublereal *res, doublereal *b, integer *ldb,
+	doublereal *w, integer *ldw, doublereal *s, integer *lds,
+	doublereal *work, integer *lwork, integer *iwork, integer *liwork,
+	integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1,
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2;
+    doublereal d__1, d__2;
+
+    /* Builtin functions */
+    double sqrt(doublereal);
+
+    /* Local variables */
+    doublereal one, zero, ab[4], ofl, scale, small, ssum, rootsc,
+	    rdummy[2], rdummy2[2], xscl1, xscl2;
+    integer i__, j, iminwr, info1, info2, lwrkev, lwrsdd, lwrsvd, lwrsvq,
+	    mlwork, mwrkev, mwrsdd, mwrsvd, mwrsvj, mwrsvq, numrnk, olwork;
+    logical badxy, lquery, sccolx, sccoly, wntex, wntref, wntres, wntvec;
+    char jobzl[1], jsvopt[1], t_or_n__[1];
+    extern logical lsame_(char *, char *), disnan_(doublereal *);
+    extern doublereal dlamch_(char *), dnrm2_(integer *, doublereal *,
+	    integer *), dlange_(char *, integer *, integer *, doublereal *,
+	    integer *, doublereal *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern /* Subroutine */ int daxpy_(), dgeev_(), dgejsv_(), dgemm_(),
+	    dgesdd_(), dgesvd_(), dgesvdq_(), dlacpy_(), dlascl_(),
+	    dlassq_(), dscal_(), xerbla_();
+
+/* M (input) INTEGER, M >= 0 */
+/* The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/* N (input) INTEGER, 0 <= N <= M */
+/* The number of data snapshot pairs */
+/* (the number of columns of X and Y). */
+/* ..... */
+/* X (input/output) REAL(KIND=WP) M-by-N array */
+/* > On entry, X contains the data snapshot matrix X. It is */
+/* assumed that the column norms of X are in the range of */
+/* the normalized floating point numbers. */
+/* < On exit, the leading K columns of X contain a POD basis, */
+/* i.e. the leading K left singular vectors of the input */
+/* data matrix X, U(:,1:K). All N columns of X contain all */
+/* left singular vectors of the input matrix X. */
+/* See the descriptions of K, Z and W. */
+/* ..... */
+/* LDX (input) INTEGER, LDX >= M */
+/* The leading dimension of the array X. */
+/* ..... */
+/* Y (input/workspace/output) REAL(KIND=WP) M-by-N array */
+/* > On entry, Y contains the data snapshot matrix Y */
+/* < On exit, */
+/* If JOBR == 'R', the leading K columns of Y contain */
+/* the residual vectors for the computed Ritz pairs. */
+/* See the description of RES. */
+/* If JOBR == 'N', Y contains the original input data, */
+/* scaled according to the value of JOBS. */
+/* ..... */
+/* LDY (input) INTEGER , LDY >= M */
+/* The leading dimension of the array Y. */
+/* ..... */
+/* NRNK (input) INTEGER */
+/* Determines the mode how to compute the numerical rank, */
+/* i.e. how to truncate small singular values of the input */
+/* matrix X. On input, if */
+/* NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/* if sigma(i) <= TOL*sigma(1). */
+/* This option is recommended. */
+/* NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/* if sigma(i) <= TOL*sigma(i-1) */
+/* This option is included for R&D purposes. */
+/* It requires highly accurate SVD, which */
+/* may not be feasible. */
+
+/* The numerical rank can be enforced by using positive */
+/* value of NRNK as follows: */
+/* 0 < NRNK <= N :: at most NRNK largest singular values */
+/* will be used. If the number of the computed nonzero */
+/* singular values is less than NRNK, then only those */
+/* nonzero values will be used and the actually used */
+/* dimension is less than NRNK. The actual number of */
+/* the nonzero singular values is returned in the variable */
+/* K.
See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, and Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of IMEIG contain */ +/* the imaginary parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* The eigenvalues are determined as follows: */ +/* If IMEIG(i) == 0, then the corresponding eigenvalue is */ +/* real, LAMBDA(i) = REIG(i). */ +/* If IMEIG(i)>0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, and Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* || Z(:,i:i+1)||_F = 1. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of X(:,1:K)*W(1:K,1:K), where the columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */ +/* are similarly structured: If IMEIG(i) == 0 then */ +/* X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */ +/* then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */ +/* X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of REIG, IMEIG and Z. */ +/* ..... 
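The RES formula above for a complex conjugate pair comes down to one DGEMM and one DLANGE call, which is exactly how the routine evaluates it near the end of the source. A hedged, self-contained Fortran sketch of that computation; the subroutine name RITZ_RESIDUAL and the assumption that Y(1:M,i:i+1) already holds A*Z(1:M,i:i+1) are illustrative, not part of DGEDMD:

      SUBROUTINE RITZ_RESIDUAL( M, I, Z, LDZ, Y, LDY, REIG, IMEIG, RESI )
!     Residual of the complex conjugate Ritz pair (i, i+1), assuming
!     Y(1:M,i:i+1) = A*Z(1:M,i:i+1) has been formed by the caller.
      IMPLICIT NONE
      INTEGER          :: M, I, LDZ, LDY
      DOUBLE PRECISION :: Z(LDZ,*), Y(LDY,*), REIG(*), IMEIG(*), RESI
      DOUBLE PRECISION :: AB(2,2), DUMMY(1)
      DOUBLE PRECISION, EXTERNAL :: DLANGE
!     The 2-by-2 block B from the description of RES.
      AB(1,1) =  REIG(I)
      AB(2,1) = -IMEIG(I)
      AB(1,2) =  IMEIG(I)
      AB(2,2) =  REIG(I)
!     Y(:,i:i+1) := Y(:,i:i+1) - Z(:,i:i+1)*AB, then its Frobenius norm.
      CALL DGEMM( 'N', 'N', M, 2, 2, -1.0D0, Z(1,I), LDZ, AB, 2, &
                  1.0D0, Y(1,I), LDY )
      RESI = DLANGE( 'F', M, 2, Y(1,I), LDY, DUMMY )
      END SUBROUTINE RITZ_RESIDUAL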
*/
+/* B (output) REAL(KIND=WP) M-by-N array. */
+/* If JOBF == 'R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/* be used for computing the refined vectors; see further */
+/* details in the provided references. */
+/* If JOBF == 'E', B(1:M,1:K) contains */
+/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/* Exact DMD, up to scaling by the inverse eigenvalues. */
+/* If JOBF == 'N', then B is not referenced. */
+/* See the descriptions of X, W, K. */
+/* ..... */
+/* LDB (input) INTEGER, LDB >= M */
+/* The leading dimension of the array B. */
+/* ..... */
+/* W (workspace/output) REAL(KIND=WP) N-by-N array */
+/* On exit, W(1:K,1:K) contains the K computed */
+/* eigenvectors of the matrix Rayleigh quotient (real and */
+/* imaginary parts for each complex conjugate pair of the */
+/* eigenvalues). The Ritz vectors (returned in Z) are the */
+/* product of X (containing a POD basis for the input */
+/* matrix X) and W. See the descriptions of K, S, X and Z. */
+/* W is also used as a workspace to temporarily store the */
+/* right singular vectors of X. */
+/* ..... */
+/* LDW (input) INTEGER, LDW >= N */
+/* The leading dimension of the array W. */
+/* ..... */
+/* S (workspace/output) REAL(KIND=WP) N-by-N array */
+/* The array S(1:K,1:K) is used for the matrix Rayleigh */
+/* quotient. This content is overwritten during */
+/* the eigenvalue decomposition by DGEEV. */
+/* See the description of K. */
+/* ..... */
+/* LDS (input) INTEGER, LDS >= N */
+/* The leading dimension of the array S. */
+/* ..... */
+/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/* On exit, WORK(1:N) contains the singular values of */
+/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/* If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */
+/* the scaling factor WORK(N+2)/WORK(N+1) used to scale X */
+/* and Y to avoid overflow in the SVD of X. */
+/* This may be of interest if the scaling option is off */
+/* and as many as possible smallest eigenvalues are */
+/* desired to the highest feasible accuracy. */
+/* If the call to DGEDMD is only a workspace query, then */
+/* WORK(1) contains the minimal workspace length and */
+/* WORK(2) is the optimal workspace length. Hence, the */
+/* length of WORK is at least 2. */
+/* See the description of LWORK. */
+/* ..... */
+/* LWORK (input) INTEGER */
+/* The minimal length of the workspace vector WORK. */
+/* LWORK is calculated as follows: */
+/* If WHTSVD == 1 :: */
+/* If JOBZ == 'V', then */
+/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */
+/* If JOBZ == 'N', then */
+/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */
+/* Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */
+/* workspace length of DGESVD. */
+/* If WHTSVD == 2 :: */
+/* If JOBZ == 'V', then */
+/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */
+/* If JOBZ == 'N', then */
+/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */
+/* Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */
+/* minimal workspace length of DGESDD. */
+/* If WHTSVD == 3 :: */
+/* If JOBZ == 'V', then */
+/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/* If JOBZ == 'N', then */
+/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/* Here LWORK_SVD = N+M+MAX(3*N+1, */
+/* MAX(1,3*N+M,5*N),MAX(1,N)) */
+/* is the minimal workspace length of DGESVDQ.
*/ +/* If WHTSVD == 4 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */ +/* minimal workspace length of DGEJSV. */ +/* The above expressions are not simplified in order to */ +/* make the usage of WORK more transparent, and for */ +/* easier checking. In any case, LWORK >= 2. */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ 
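The query convention above gives the usual two-phase calling pattern: call once with LWORK = -1 and LIWORK = -1, allocate, then call again. A hedged sketch of that calling sequence, not a complete program; M, N, TOL and the array arguments are assumed to be declared and filled elsewhere, and the mode settings ('S', 'V', 'R', 'N', WHTSVD = 1, NRNK = -1) are just one reasonable choice:

      DOUBLE PRECISION :: WDUM(2)
      INTEGER :: IDUM(1), LWORK, LIWORK, INFO
      DOUBLE PRECISION, ALLOCATABLE :: WORK(:)
      INTEGER, ALLOCATABLE :: IWORK(:)
!     Phase 1: workspace query; only WDUM(1:2) and IDUM(1) are set.
      CALL DGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY,  &
                   -1, TOL, K, REIG, IMEIG, Z, LDZ, RES, B, LDB, &
                   W, LDW, S, LDS, WDUM, -1, IDUM, -1, INFO )
      LWORK  = INT( WDUM(2) )   ! WDUM(1) is the minimum, WDUM(2) the optimum
      LIWORK = IDUM(1)
      ALLOCATE( WORK(LWORK), IWORK(LIWORK) )
!     Phase 2: the actual computation with the allocated workspace.
      CALL DGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY,  &
                   -1, TOL, K, REIG, IMEIG, Z, LDZ, RES, B, LDB, &
                   W, LDW, S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )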
*/ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -18; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -21; + } else if (*ldw < *n) { + *info = -23; + } else if (*lds < *n) { + *info = -25; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. 
*/ + if (lquery) { + iwork[1] = 1; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwork = f2cmax(2,*n); + olwork = f2cmax(2,*n); + iminwr = 1; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of DGESVD: */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1, + i__2), i__2 = f2cmin(*m,*n) * 5; + mwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvd; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[ + b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, & + info1); +/* Computing MAX */ + i__1 = mwrsvd, i__2 = (integer) rdummy[0]; + lwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of DGESDD: */ +/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */ +/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */ +/* IMINWR = 8*MIN(M,N) */ +/* Computing MAX */ + i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,* + n) << 2); + mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsdd; + mlwork = f2cmax(i__1,i__2); + iminwr = f2cmin(*m,*n) << 3; + if (lquery) { + dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], & + info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) rdummy[0]; + lwrsdd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsdd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 3) { +/* LWQP3 = 3*N+1 */ +/* LWORQ = MAX(N, 1) */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) */ +/* MLWORK = N + MWRSVQ */ +/* IMINWR = M+N-1 */ + dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], liwork, rdummy, &c_n1, rdummy2, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) rdummy[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0]; + mlwork = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = mwrsvq, i__2 = (integer) rdummy[0]; + lwrsvq = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0]; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; +/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! 
for JSVOPT='V' */ +/* Computing MAX */ + i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (* + n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1) + + *n * *n + 6; + mwrsvj = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvj; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 3, i__2 = *m + *n * 3; + iminwr = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = olwork, i__2 = *n + mwrsvj; + olwork = f2cmax(i__1,i__2); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the DGEEV call */ + if (lsame_(jobzl, "V")) { +/* Computing MAX */ + i__1 = 1, i__2 = *n << 2; + mwrkev = f2cmax(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = 1, i__2 = *n * 3; + mwrkev = f2cmax(i__1,i__2); + } +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrkev; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[ + w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1); +/* Computing MAX */ + i__1 = mwrkev, i__2 = (integer) rdummy[0]; + lwrkev = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrkev; + olwork = f2cmax(i__1,i__2); + } + + if (*liwork < iminwr && ! lquery) { + *info = -29; + } + if (*lwork < mlwork && ! lquery) { + *info = -27; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (doublereal) mlwork; + work[2] = (doublereal) olwork; + return 0; + } +/* ............................................................ */ + + ofl = dlamch_("O"); + small = dlamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using DLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, X(1,i), 1 ) */ + scale = zero; + dlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("DGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. 
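The check that follows relies on the norms just computed: WORK(i) > 0 means X(:,i) was scaled to unit norm, WORK(i) < 0 encodes a norm that would overflow (stored as -SCALE*(ROOTSC/M)), and WORK(i) = 0 marks a zero column, so K counting every column means X carries no information. The overflow-safe norm itself is the standard DLASSQ idiom: DLASSQ returns SCL and SSQ with SCL**2 * SSQ equal to the accumulated sum of squares, so the 2-norm is SCL*SQRT(SSQ) and no entry is ever squared directly. A hedged sketch of the idiom in isolation, with illustrative names:

      DOUBLE PRECISION :: SCL, SSQ, XNRM
      EXTERNAL :: DLASSQ
      SCL = 0.0D0
      SSQ = 1.0D0
!     After the call: SCL**2 * SSQ = sum( X(1:M,I)**2 ).
      CALL DLASSQ( M, X(1,I), 1, SCL, SSQ )
      XNRM = SCL * SQRT( SSQ )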
*/ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (work[i__] > zero) { + d__1 = one / work[i__]; + dscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + d__1 = -work[i__]; + d__2 = one / (doublereal) (*m); + dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * + y_dim1 + 1], m, &info2); +/* LAPACK CAL */ + } else if (y[idamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * + y_dim1] != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + dscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using DLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */ + scale = zero; + dlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("DGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (work[i__] > zero) { + d__1 = one / work[i__]; + dscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + d__1 = -work[i__]; + d__2 = one / (doublereal) (*m); + dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * + x_dim1 + 1], m, &info2); +/* LAPACK CAL */ + } else if (x[idamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * + x_dim1] != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. 
*/ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + i__1 = *lwork - *n; + dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 2) { + i__1 = *lwork - *n; + dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[ + w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 3) { + i__1 = *lwork - *n - f2cmax(2,*m); + i__2 = f2cmax(2,*m); + dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], & + z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], & + i__2, &info1); +/* L */ + dlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 4) { + i__1 = *lwork - *n; + dgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1] + , &i__1, &iwork[1], &info1); +/* LAPACK CALL */ + dlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = work[*n + 1]; + xscl2 = work[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case DGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + dlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (work[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= work[1] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^T * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^T is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). 
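Stated in Fortran, and hedged as a back-translation rather than the generated source itself, the scaling performed next for the transposed case, where W(1:K,1:N) holds V_k**T, is:

!     Form V_k * Sigma_k**(-1); WORK(1:K) holds the singular values.
      DO I = 1, K
         WORK(N+I) = 1.0D0 / WORK(I)
      END DO
      DO J = 1, N
         DO I = 1, K
            W(I,J) = WORK(N+I) * W(I,J)
         END DO
      END DO

Precomputing the reciprocals in WORK(N+1:N+K) turns what would be strided DSCAL sweeps over rows into a unit-stride column sweep.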
*/ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + d__1 = one / work[i__]; + dscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that DGESVD, DGESVDQ and DGESDD return the */ +/* transposed matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + work[*n + i__] = one / work[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1]; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside DGEDMD). */ + dgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + dgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], + ldz, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + dgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, + &zero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ +/* In the two DGEMM calls here, can use K for LDZ. */ +/* B */ + dgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset] + , ldw, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^T is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + dlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + dlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + i__1 = *lwork - *n; + dgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset] + , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. Even in the case of complex spectrum, all */ +/* computation is done in real arithmetic. REIG and */ +/* IMEIG are the real and the imaginary parts of the */ +/* eigenvalues, so that the spectrum is given as */ +/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */ +/* are listed at consecutive positions. 
For such a */ +/* complex conjugate pair of the eigenvalues, the */ +/* corresponding eigenvectors are also a complex */ +/* conjugate pair with the real and imaginary parts */ +/* stored column-wise in W at the corresponding */ +/* consecutive column indices. See the description of Z. */ +/* Also, see the description of DGEEV. */ +/* LAPACK C */ + if (info1 > 0) { +/* DGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + dgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[ + s_offset], lds, &zero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + dlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], + lds, &zero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */ +/* LDS, ZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the real form of the Ritz vectors */ + if (wntvec) { + dgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__ = 1; + while(i__ <= *k) { + if (imeig[i__] == zero) { +/* have a real eigenvalue with real eigenvector */ + d__1 = -reig[i__]; + daxpy_(m, &d__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! */ + + res[i__] = dnrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + + ++i__; + } else { +/* Have a complex conjugate pair */ +/* REIG(i) +- sqrt(-1)*IMEIG(i). */ +/* Since all computation is done in real */ +/* arithmetic, the formula for the residual */ +/* is recast for real representation of the */ +/* complex conjugate eigenpair. See the */ +/* description of RES. 
*/ + ab[0] = reig[i__]; + ab[1] = -imeig[i__]; + ab[2] = imeig[i__]; + ab[3] = reig[i__]; + d__1 = -one; + dgemm_("N", "N", m, &c__2, &c__2, &d__1, &z__[i__ * + z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * + y_dim1 + 1], ldy); +/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INT */ +/* BL */ + res[i__] = dlange_("F", m, &c__2, &y[i__ * y_dim1 + 1], + ldy, &work[*n + 1]); +/* LA */ + res[i__ + 1] = res[i__]; + i__ += 2; + } + } + } + } + + if (*whtsvd == 4) { + work[*n + 1] = xscl1; + work[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* dgedmd_ */ + diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 new file mode 100644 index 000000000..20424808f --- /dev/null +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -0,0 +1,1054 @@ + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!............................................................ +! Purpose +! ======= +! DGEDMD computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, DGEDMD computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, DGEDMD returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +!...................................................................... +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! 
- DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!............................................................ +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product X(:,1:K)*W, where X +! contains a POD basis (leading left singular vectors +! of the data matrix X) and W contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of K, X, W, Z. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will be +! computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: DGESVD (the QR SVD algorithm) +! 2 :: DGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! 
the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M>= 0 +! The state space dimension (the row dimension of X, Y). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshot pairs +! (the number of columns of X and Y). +!..... +! X (input/output) REAL(KIND=WP) M-by-N array +! > On entry, X contains the data snapshot matrix X. It is +! assumed that the column norms of X are in the range of +! the normalized floating point numbers. +! < On exit, the leading K columns of X contain a POD basis, +! i.e. the leading K left singular vectors of the input +! data matrix X, U(:,1:K). All N columns of X contain all +! left singular vectors of the input matrix X. +! See the descriptions of K, Z and W. +!..... +! LDX (input) INTEGER, LDX >= M +! The leading dimension of the array X. +!..... +! Y (input/workspace/output) REAL(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1). +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, and Z. +!..... +! IMEIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of IMEIG contain +! the imaginary parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! The eigenvalues are determined as follows: +! 
If IMEIG(i) == 0, then the corresponding eigenvalue is
+!    real, LAMBDA(i) = REIG(i).
+!    If IMEIG(i)>0, then the corresponding complex
+!    conjugate pair of eigenvalues reads
+!    LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i)
+!    LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i)
+!    That is, complex conjugate pairs have consecutive
+!    indices (i,i+1), with the positive imaginary part
+!    listed first.
+!    See the descriptions of K, REIG, and Z.
+!.....
+!    Z (workspace/output) REAL(KIND=WP) M-by-N array
+!    If JOBZ =='V' then
+!    Z contains real Ritz vectors as follows:
+!    If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!    the i-th Ritz value; ||Z(:,i)||_2=1.
+!    If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!    [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!    the Ritz values extracted from this subspace are
+!    REIG(i) + sqrt(-1)*IMEIG(i) and
+!    REIG(i) - sqrt(-1)*IMEIG(i).
+!    The corresponding eigenvectors are
+!    Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!    Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!    || Z(:,i:i+1)||_F = 1.
+!    If JOBZ == 'F', then the above descriptions hold for
+!    the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!    of W(1:K,1:K) are the computed eigenvectors of the
+!    K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!    are similarly structured: If IMEIG(i) == 0 then
+!    X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
+!    then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!    X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!    are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!    See the descriptions of REIG, IMEIG, X and W.
+!.....
+!    LDZ (input) INTEGER , LDZ >= M
+!    The leading dimension of the array Z.
+!.....
+!    RES (output) REAL(KIND=WP) N-by-1 array
+!    RES(1:K) contains the residuals for the K computed
+!    Ritz pairs.
+!    If LAMBDA(i) is real, then
+!    RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!    If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!    then
+!    RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!    where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!              [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!    It holds that
+!    RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!    RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!    where ZC(:,i)   = Z(:,i) + sqrt(-1)*Z(:,i+1)
+!          ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1)
+!    See the description of REIG, IMEIG and Z.
+!.....
+!    B (output) REAL(KIND=WP) M-by-N array.
+!    IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!    be used for computing the refined vectors; see further
+!    details in the provided references.
+!    If JOBF == 'E', B(1:M,1:K) contains
+!    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!    Exact DMD, up to scaling by the inverse eigenvalues.
+!    If JOBF =='N', then B is not referenced.
+!    See the descriptions of X, W, K.
+!.....
+!    LDB (input) INTEGER, LDB >= M
+!    The leading dimension of the array B.
+!.....
+!    W (workspace/output) REAL(KIND=WP) N-by-N array
+!    On exit, W(1:K,1:K) contains the K computed
+!    eigenvectors of the matrix Rayleigh quotient (real and
+!    imaginary parts for each complex conjugate pair of the
+!    eigenvalues). The Ritz vectors (returned in Z) are the
+!    product of X (containing a POD basis for the input
+!    matrix X) and W. See the descriptions of K, S, X and Z.
+!    W is also used as a workspace to temporarily store the
+!    right singular vectors of X.
+!.....
+!    LDW (input) INTEGER, LDW >= N
+!    The leading dimension of the array W.
+!.....
+!    S (workspace/output) REAL(KIND=WP) N-by-N array
+!    The array S(1:K,1:K) is used for the matrix Rayleigh
+!    quotient. This content is overwritten during
+!    the eigenvalue decomposition by DGEEV.
+!    See the description of K.
+!.....
+!    LDS (input) INTEGER, LDS >= N
+!    The leading dimension of the array S.
+!.....
+!    WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!    On exit, WORK(1:N) contains the singular values of
+!    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!    If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!    scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!    and Y to avoid overflow in the SVD of X.
+!    This may be of interest if the scaling option is off
+!    and as many as possible smallest eigenvalues are
+!    desired to the highest feasible accuracy.
+!    If the call to DGEDMD is only a workspace query, then
+!    WORK(1) contains the minimal workspace length and
+!    WORK(2) is the optimal workspace length. Hence, the
+!    length of WORK is at least 2.
+!    See the description of LWORK.
+!.....
+!    LWORK (input) INTEGER
+!    The minimal length of the workspace vector WORK.
+!    LWORK is calculated as follows:
+!    If WHTSVD == 1 ::
+!       If JOBZ == 'V', then
+!       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
+!       If JOBZ == 'N', then
+!       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
+!       Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
+!       workspace length of DGESVD.
+!    If WHTSVD == 2 ::
+!       If JOBZ == 'V', then
+!       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
+!       If JOBZ == 'N', then
+!       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
+!       Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
+!       minimal workspace length of DGESDD.
+!    If WHTSVD == 3 ::
+!       If JOBZ == 'V', then
+!       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!       If JOBZ == 'N', then
+!       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!       Here LWORK_SVD = N+M+MAX(3*N+1,
+!                        MAX(1,3*N+M,5*N),MAX(1,N))
+!       is the minimal workspace length of DGESVDQ.
+!    If WHTSVD == 4 ::
+!       If JOBZ == 'V', then
+!       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!       If JOBZ == 'N', then
+!       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!       Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
+!       minimal workspace length of DGEJSV.
+!    The above expressions are not simplified in order to
+!    make the usage of WORK more transparent, and for
+!    easier checking. In any case, LWORK >= 2.
+!    If on entry LWORK = -1, then a workspace query is
+!    assumed and the procedure only computes the minimal
+!    and the optimal workspace lengths for both WORK and
+!    IWORK. See the descriptions of WORK and IWORK.
+!.....
+!    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!    Workspace that is required only if WHTSVD equals
+!    2, 3 or 4. (See the description of WHTSVD).
+!    If on entry LWORK =-1 or LIWORK=-1, then the
+!    minimal length of IWORK is computed and returned in
+!    IWORK(1). See the description of LIWORK.
+!.....
+!    LIWORK (input) INTEGER
+!    The minimal length of the workspace vector IWORK.
+!    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!    If on entry LIWORK = -1, then a workspace query is
+!    assumed and the procedure only computes the minimal
+!    and the optimal workspace lengths for both WORK and
+!    IWORK. See the descriptions of WORK and IWORK.
+!.....
+!    INFO (output) INTEGER
+!    -i < 0 :: On entry, the i-th argument had an
+!              illegal value
+!    = 0 :: Successful return.
+!    = 1 :: Void input. Quick exit (M=0 or N=0).
+!    = 2 :: The SVD computation of X did not converge.
+!           Suggestion: Check the input data and/or
+!           repeat with different WHTSVD.
+!    = 3 :: The computation of the eigenvalues did not
+!           converge.
+!    
= 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT + +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 + EXTERNAL DLANGE, DLAMCH, DNRM2, IDAMAX + INTEGER IDAMAX + LOGICAL DISNAN, LSAME + EXTERNAL DISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DAXPY, DGEMM, DSCAL + EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & + DLACPY, DLASCL, DLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC DBLE, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -18 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -21 + ELSE IF ( LDW < N ) THEN + INFO = -23 + ELSE IF ( LDS < N ) THEN + INFO = -25 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWORK = MAX(2,N) + OLWORK = MAX(2,N) + IMINWR = 1 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! 
length of WORK in the definition of DGESVD: + ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MLWORK = MAX(MLWORK,N + MWRSVD) + IF ( LQUERY ) THEN + CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, & + B, LDB, W, LDW, RDUMMY, -1, INFO1 ) + LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of DGESDD: + ! MWRSDD = 3*MIN(M,N)*MIN(M,N) + + ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + ! IMINWR = 8*MIN(M,N) + MWRSDD = 3*MIN(M,N)*MIN(M,N) + & + MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + MLWORK = MAX(MLWORK,N + MWRSDD) + IMINWR = 8*MIN(M,N) + IF ( LQUERY ) THEN + CALL DGESDD( 'O', M, N, X, LDX, WORK, B, & + LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSDD) + END IF + CASE (3) + !LWQP3 = 3*N+1 + !LWORQ = MAX(N, 1) + !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) + !MLWORK = N + MWRSVQ + !IMINWR = M+N-1 + CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, RDUMMY, & + -1, RDUMMY2, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(RDUMMY(2)) + MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1))) + IF ( LQUERY ) THEN + LWRSVQ = MAX( MWRSVQ, INT(RDUMMY(1)) ) + OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1))) + END IF + CASE (4) + JSVOPT = 'J' + !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V' + MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 ) + MLWORK = MAX(MLWORK,N+MWRSVJ) + IMINWR = MAX( 3, M+3*N ) + IF ( LQUERY ) THEN + OLWORK = MAX(OLWORK,N+MWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the DGEEV call + IF ( LSAME(JOBZL,'V') ) THEN + MWRKEV = MAX( 1, 4*N ) + ELSE + MWRKEV = MAX( 1, 3*N ) + END IF + MLWORK = MAX(MLWORK,N+MWRKEV) + IF ( LQUERY ) THEN + CALL DGEEV( 'N', JOBZL, N, S, LDS, REIG, & + IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 ) + LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) ) + OLWORK = MAX( OLWORK, N+LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29 + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27 + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'DGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = DLAMCH('O') + SMALL = DLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using DLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! 
Scaling is performed without overflow, and
+!              underflow may occur in the smallest entries
+!              of X(:,i). The relative backward and forward
+!              errors are small in the ell_2 norm.
+               CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                            M, 1, X(1,i), M, INFO2 )
+               WORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+            ELSE
+!              X(:,i) will be scaled to unit 2-norm
+               WORK(i) = SCALE * ROOTSC
+               CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                            X(1,i), M, INFO2 )          ! LAPACK CALL
+!              X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC
+            END IF
+         ELSE
+            WORK(i) = ZERO
+            K = K + 1
+         END IF
+      END DO
+      IF ( K == N ) THEN
+      ! All columns of X are zero. Return error code -8.
+      ! (the 8th input variable had an illegal value)
+         K = 0
+         INFO = -8
+         CALL XERBLA('DGEDMD',-INFO)
+         RETURN
+      END IF
+      DO i = 1, N
+! Now, apply the same scaling to the columns of Y.
+         IF ( WORK(i) > ZERO ) THEN
+            CALL DSCAL( M, ONE/WORK(i), Y(1,i), 1 )  ! BLAS CALL
+!           Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC
+         ELSE IF ( WORK(i) < ZERO ) THEN
+            CALL DLASCL( 'G', 0, 0, -WORK(i), &
+                 ONE/DBLE(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL
+         ELSE IF ( Y(IDAMAX(M, Y(1,i),1),i ) &
+                                   /= ZERO ) THEN
+!           X(:,i) is zero vector. For consistency,
+!           Y(:,i) should also be zero. If Y(:,i) is not
+!           zero, then the data might be inconsistent or
+!           corrupted. If JOBS == 'C', Y(:,i) is set to
+!           zero and a warning flag is raised.
+!           The computation continues but the
+!           situation will be reported in the output.
+            BADXY = .TRUE.
+            IF ( LSAME(JOBS,'C')) &
+               CALL DSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL
+         END IF
+      END DO
+   END IF
+  !
+   IF ( SCCOLY ) THEN
+      ! The columns of Y will be normalized.
+      ! To prevent overflows, the column norms of Y are
+      ! carefully computed using DLASSQ.
+      DO i = 1, N
+         !WORK(i) = DNRM2( M, Y(1,i), 1 )
+         SCALE = ZERO
+         CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+         IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+             K = 0
+             INFO = -10
+             CALL XERBLA('DGEDMD',-INFO)
+         END IF
+         IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN
+            ROOTSC = SQRT(SSUM)
+            IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!              Norm of Y(:,i) overflows. First, Y(:,i)
+!              is scaled by
+!              ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!              Next, the norm of Y(:,i) is stored without
+!              overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!              the minus sign indicating the 1/M factor.
+!              Scaling is performed without overflow, and
+!              underflow may occur in the smallest entries
+!              of Y(:,i). The relative backward and forward
+!              errors are small in the ell_2 norm.
+               CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                            M, 1, Y(1,i), M, INFO2 )
+               WORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+            ELSE
+!              Y(:,i) will be scaled to unit 2-norm
+               WORK(i) = SCALE * ROOTSC
+               CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                            Y(1,i), M, INFO2 )          ! LAPACK CALL
+!              Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC
+            END IF
+         ELSE
+            WORK(i) = ZERO
+         END IF
+      END DO
+      DO i = 1, N
+! Now, apply the same scaling to the columns of X.
+         IF ( WORK(i) > ZERO ) THEN
+            CALL DSCAL( M, ONE/WORK(i), X(1,i), 1 )  ! BLAS CALL
+!           X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC
+         ELSE IF ( WORK(i) < ZERO ) THEN
+            CALL DLASCL( 'G', 0, 0, -WORK(i), &
+                 ONE/DBLE(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL
+         ELSE IF ( X(IDAMAX(M, X(1,i),1),i ) &
+                                   /= ZERO ) THEN
+!           Y(:,i) is zero vector. If X(:,i) is not
+!           zero, then a warning flag is raised.
+!           The computation continues but the
+!           situation will be reported in the output.
+            BADXY = .TRUE.
+         END IF
+      END DO
+   END IF
+!
+! <2> SVD of the data snapshot matrix X.
+! =====================================
+!    
The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, B, & + LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (2) + CALL DGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, & + LDW, WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (3) + CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),& + LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1) ! LAPACK CALL + CALL DLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'T' + CASE (4) + CALL DGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, WORK, Z, LDZ, W, LDW, & + WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL DLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = WORK(N+1) + XSCL2 = WORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case DGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL DLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( WORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( WORK(i+1) <= WORK(i)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( WORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^T * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^T is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL DSCAL( N, ONE/WORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that DGESVD, DGESVDQ and DGESDD return the + ! transposed matrix of the right singular vectors. + !DO i = 1, K + ! CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + WORK(N+i) = ONE/WORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = (WORK(N+i))*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside DGEDMD). 
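+     ! A hedged sketch (not part of this routine) of how the
+     ! refinement data can be used outside DGEDMD: for a computed
+     ! real Ritz value LAMBDA, the refined Ritz vector is
+     ! X(:,1:K)*v, where v is the right singular vector of
+     ! B(1:M,1:K) - LAMBDA*X(1:M,1:K) associated with the smallest
+     ! singular value; see [2], [3]. With hypothetical caller-side
+     ! arrays E(LDE,K), SSV(K), VT(K,K) and workspace WRK(LWRK):
+     !    CALL DLACPY( 'A', M, K, B, LDB, E, LDE )
+     !    DO j = 1, K
+     !       CALL DAXPY( M, -LAMBDA, X(1,j), 1, E(1,j), 1 )
+     !    END DO
+     !    CALL DGESVD( 'N', 'A', M, K, E, LDE, SSV, E, LDE, &
+     !                 VT, K, WRK, LWRK, INFO1 )
+     !    ! v = VT(K,1:K); refined vector = X(:,1:K)*v
+     ! Complex conjugate pairs require the 2-by-2 real block form
+     ! used below for the residuals in RES.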
+     CALL DGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, &
+                 LDW, ZERO, Z, LDZ )                         ! BLAS CALL
+     ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+     ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))            ! INTRINSIC, for T_OR_N=='N'
+     !
+     ! At this point Z contains
+     ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+     ! this is needed for computing the residuals.
+     ! This matrix is returned in the array B and
+     ! it can be used to compute refined Ritz vectors.
+     CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+     ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+     CALL DGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, &
+                 LDZ, ZERO, S, LDS )                          ! BLAS CALL
+     ! S(1:K,1:K) = MATMUL(TRANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC
+     ! At this point S = U^T * A * U is the Rayleigh quotient.
+  ELSE
+     ! A * U(:,1:K) is not explicitly needed and the
+     ! computation is organized differently. The Rayleigh
+     ! quotient is computed more efficiently.
+     CALL DGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, &
+                ZERO, Z, LDZ )                                   ! BLAS CALL
+     ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC
+     ! In the two DGEMM calls here, can use K for LDZ.
+     CALL DGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, &
+                 LDW, ZERO, S, LDS )                            ! BLAS CALL
+     ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+     ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))           ! INTRINSIC, for T_OR_N=='N'
+     ! At this point S = U^T * A * U is the Rayleigh quotient.
+     ! If the residuals are requested, save scaled V_k into Z.
+     ! Recall that V_k or V_k^T is stored in W.
+     IF ( WNTRES .OR. WNTEX ) THEN
+       IF ( LSAME(T_OR_N, 'N') ) THEN
+          CALL DLACPY( 'A', N, K, W, LDW, Z, LDZ )
+       ELSE
+          CALL DLACPY( 'A', K, N, W, LDW, Z, LDZ )
+       END IF
+     END IF
+  END IF
+!
+  !<5> Compute the Ritz values and (if requested) the
+  !    right eigenvectors of the Rayleigh quotient.
+  !
+  CALL DGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, &
+       LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL
+  !
+  ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+  ! quotient. Even in the case of complex spectrum, all
+  ! computation is done in real arithmetic. REIG and
+  ! IMEIG are the real and the imaginary parts of the
+  ! eigenvalues, so that the spectrum is given as
+  ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs
+  ! are listed at consecutive positions. For such a
+  ! complex conjugate pair of the eigenvalues, the
+  ! corresponding eigenvectors are also a complex
+  ! conjugate pair with the real and imaginary parts
+  ! stored column-wise in W at the corresponding
+  ! consecutive column indices. See the description of Z.
+  ! Also, see the description of DGEEV.
+  IF ( INFO1 > 0 ) THEN
+     ! DGEEV failed to compute the eigenvalues and
+     ! eigenvectors of the Rayleigh quotient.
+     INFO = 3
+     RETURN
+  END IF
+!
+  ! <6> Compute the eigenvectors (if requested) and
+  ! the residuals (if requested).
+  !
+  IF ( WNTVEC .OR. WNTEX ) THEN
+     IF ( WNTRES ) THEN
+        IF ( WNTREF ) THEN
+           ! Here, if the refinement is requested, we have
+           ! A*U(:,1:K) already computed and stored in Z.
+           ! For the residuals, need Y = A * U(:,1:K) * W.
+           CALL DGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, &
+                       LDW, ZERO, Y, LDY )               ! BLAS CALL
+           ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)        ! INTRINSIC
+           ! This frees Z; Y contains A * U(:,1:K) * W.
+        ELSE
+           ! Compute S = V_k * Sigma_k^(-1) * W, where
+           ! V_k * Sigma_k^(-1) is stored in Z
+           CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS)
+           ! Then, compute Z = Y * S =
+           ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+           ! 
= A * U(:,1:K) * W(1:K,1:K) + CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, Z, LDZ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL DLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + ! LDS, ZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the real form of the Ritz vectors + IF ( WNTVEC ) CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + i = 1 + DO WHILE ( i <= K ) + IF ( IMEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! INTRINSIC + RES(i) = DNRM2( M, Y(1,i), 1) ! BLAS CALL + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IMEIG(i) + AB(1,2) = IMEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + LDZ, AB, 2, ONE, Y(1,i), LDY ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES(i) = DLANGE( 'F', M, 2, Y(1,i), LDY, & + WORK(N+1) ) ! LAPACK CALL + RES(i+1) = RES(i) + i = i + 2 + END IF + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + WORK(N+1) = XSCL1 + WORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... 
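+!
+!   Usage sketch (illustrative only; the array names and DTOL are
+!   placeholders supplied by the caller). A typical call sequence
+!   performs a workspace query first; for the query, WORK must have
+!   length >= 2 and IWORK length >= 1:
+!
+!      ! Query: LWORK = -1 (or LIWORK = -1) returns the minimal
+!      ! workspace length in WORK(1)/IWORK(1), the optimal in WORK(2).
+!      CALL DGEDMD( 'S', 'V', 'R', 'R', 1, M, N, X, LDX, Y, LDY,  &
+!                   -1, DTOL, K, REIG, IMEIG, Z, LDZ, RES, B, LDB, &
+!                   W, LDW, S, LDS, WORK, -1, IWORK, -1, INFO )
+!      LWORK  = INT(WORK(2))
+!      LIWORK = IWORK(1)
+!      ! ... allocate WORK(LWORK) and IWORK(LIWORK), then compute:
+!      CALL DGEDMD( 'S', 'V', 'R', 'R', 1, M, N, X, LDX, Y, LDY,  &
+!                   -1, DTOL, K, REIG, IMEIG, Z, LDZ, RES, B, LDB, &
+!                   W, LDW, S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )
+!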
+   END SUBROUTINE DGEDMD
+
diff --git a/lapack-netlib/SRC/dgedmdq.c b/lapack-netlib/SRC/dgedmdq.c
new file mode 100644
index 000000000..a743a3156
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmdq.c
@@ -0,0 +1,1300 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{ flag cierr;
+ ftnint ciunit;
+ flag ciend;
+ char *cifmt;
+ ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{ flag icierr;
+ char *iciunit;
+ flag iciend;
+ char *icifmt;
+ ftnint icirlen;
+ ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{ flag oerr;
+ ftnint ounit;
+ char *ofnm;
+ ftnlen ofnmlen;
+ char *osta;
+ char *oacc;
+ char *ofm;
+ ftnint orl;
+ char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{ flag cerr;
+ ftnint cunit;
+ char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{ flag aerr;
+ ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{ flag inerr;
+ ftnint inunit;
+ char *infile;
+ ftnlen infilen;
+ ftnint *inex; /*parameters in standard's order*/
+ ftnint *inopen;
+ ftnint *innum;
+ ftnint *innamed;
+ char *inname;
+ ftnlen innamlen;
+ char *inacc;
+ ftnlen inacclen;
+ char *inseq;
+ ftnlen inseqlen;
+ char *indir;
+ ftnlen indirlen;
+ char *infmt;
+ ftnlen infmtlen;
+ char *inform;
+ ftnint informlen;
+ char *inunf;
+ ftnlen inunflen;
+ ftnint *inrecl;
+ ftnint *innrec;
+ char *inblank;
+ ftnlen inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype { /* for multiple entry points */
+ integer1 g;
+ shortint h;
+ integer i;
+ /* longint j; */
+ real r;
+ doublereal d;
+ complex c;
+ doublecomplex z;
+ };
+
+typedef union Multitype Multitype;
+
+struct Vardesc { /* for Namelist */
+ char *name;
+ char *addr;
+ ftnlen *dims;
+ int type;
+ };
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+ char *name;
+ 
Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by DGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in WORK(1:N). */ +/* See the description of WORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K)of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... 
*/ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consequtive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of Z*V, where the columns of V are the */ +/* eigenvectors of the K-by-K Rayleigh quotient, and Z is */ +/* orthonormal. The columns of V are similarly structured: */ +/* If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */ +/* IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */ +/* Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... 
 */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* If JOBF == 'R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1:K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF == 'N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) contains the K eigenvectors of */ +/* the Rayleigh quotient. The eigenvectors of a complex */ +/* conjugate pair of eigenvalues are returned in real form */ +/* as explained in the description of Z. The Ritz vectors */ +/* (returned in Z) are the product of X and V; see */ +/* the descriptions of X and Z. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by DGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by DGEQRF of the */ +/* M-by-N input matrix F. */ +/* WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to DGEDMDQ is only a workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of WORK is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK.
 */ +/* LWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for DGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for DGEDMD (see the */ +/* description of LWORK in DGEDMD) for */ +/* snapshots of dimensions MIN(M,N)-by-(N-1) */ +/* MLWMQR = N (minimal workspace for */ +/* DORMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for DORGQR[M,N,N]) */ +/* Then */ +/* LWORK = MAX(N+MLWQR, N+MLWDMD) */ +/* is updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) */ +/* if JOBQ == 'Q' THEN */ +/* LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2, 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK = -1 or LIWORK = -1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >= 1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local array */ +/* ~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ..........................................................
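The LWORK = -1 / LIWORK = -1 convention documented above implies the usual two-pass LAPACK calling sequence: query the workspace sizes, allocate, then compute. The sketch below is illustrative only and not part of the translated source; run_dgedmdq is a hypothetical caller, integer and doublereal are the f2c typedefs used throughout this file, and the dgedmdq_ entry point (all arguments passed by pointer, no trailing character-length arguments) is assumed from this translation. It further assumes snapshot data with M >= N >= 2 already stored column-wise in f with leading dimension ldf >= m.

    #include <stdlib.h>

    /* Hypothetical driver sketch for the translated dgedmdq_ above. */
    static int run_dgedmdq(integer m, integer n, doublereal *f, integer ldf)
    {
        char jobs = 'S', jobz = 'V', jobr = 'R', jobq = 'N', jobt = 'N',
             jobf = 'N';
        integer whtsvd = 1;                /* SVD via DGESVD inside DGEDMD   */
        integer minmn = m < n ? m : n;     /* = n under the m >= n assumption */
        integer ldx = minmn, ldy = minmn, ldz = m, ldb = minmn;
        integer ldv = n - 1, lds = n - 1;
        integer nrnk = -1;                 /* truncate sigma(i) <= TOL*sigma(1) */
        integer k, info, lwork = -1, liwork = -1;
        doublereal tol = 1e-10;
        doublereal query[2];               /* receives WORK(1:2) in pass 1   */
        integer iquery[1];                 /* receives IWORK(1) in pass 1    */

        /* Fixed-size arrays, dimensioned as in the descriptions above.     */
        doublereal *x = malloc(sizeof(doublereal) * ldx * (n - 1));
        doublereal *y = malloc(sizeof(doublereal) * ldy * (n - 1));
        doublereal *z = malloc(sizeof(doublereal) * ldz * (n - 1));
        doublereal *b = malloc(sizeof(doublereal) * ldb * (n - 1));
        doublereal *v = malloc(sizeof(doublereal) * ldv * (n - 1));
        doublereal *s = malloc(sizeof(doublereal) * lds * (n - 1));
        doublereal *reig  = malloc(sizeof(doublereal) * (n - 1));
        doublereal *imeig = malloc(sizeof(doublereal) * (n - 1));
        doublereal *res   = malloc(sizeof(doublereal) * (n - 1));

        /* Pass 1: LWORK = LIWORK = -1 requests only the workspace sizes.   */
        dgedmdq_(&jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd,
                 &m, &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k,
                 reig, imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                 query, &lwork, iquery, &liwork, &info);

        lwork  = (integer) query[1];       /* WORK(2) = optimal WORK length  */
        liwork = iquery[0];                /* IWORK(1) = minimal IWORK length */
        doublereal *work  = malloc(sizeof(doublereal) * lwork);
        integer    *iwork = malloc(sizeof(integer) * liwork);

        /* Pass 2: the actual decomposition; a real caller should check     */
        /* info after both passes.                                          */
        dgedmdq_(&jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd,
                 &m, &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k,
                 reig, imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                 work, &lwork, iwork, &liwork, &info);

        free(x); free(y); free(z); free(b); free(v); free(s);
        free(reig); free(imeig); free(res); free(work); free(iwork);
        return (int) info;
    }

With NRNK = -1 and a modest TOL this follows the recommended truncation rule; on success the K computed Ritz values land in reig/imeig and, because JOBZ = 'V', the Ritz vectors in the leading K columns of z.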
*/ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -22; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -25; + } else if (*ldv < *n - 1) { + *info = -27; + } else if (*lds < *n - 1) { + *info = -29; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for DGEQRF. 
*/ + mlwork = minmn + mlwqr; + if (lquery) { + dgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1); + olwqr = (integer) rdummy[0]; + olwork = f2cmin(*m,*n) + olwqr; + } + i__1 = *n - 1; + dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], & + z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], + ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, & + info1); + mlwdmd = (integer) work[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); + iminwr = iwork[1]; + if (lquery) { + olwdmd = (integer) work[2]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], & + z__[z_offset], ldz, &work[1], &c_n1, &info1); + olwmqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = *n; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[ + 1], &c_n1, &info1); + olwgqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + iminwr = f2cmax(1,iminwr); + mlwork = f2cmax(2,mlwork); + if (*lwork < mlwork && ! lquery) { + *info = -31; + } + if (*liwork < iminwr && ! lquery) { + *info = -33; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (doublereal) mlwork; + work[2] = (doublereal) olwork; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lwork - minmn; + dgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + dlaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx); + i__1 = *n - 1; + dlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + dlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + dlaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lwork - minmn; + dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[ + z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See DGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. 
 */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + dlaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lwork - (minmn + *n - 1); + dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by DGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by DGEDMD. */ + dlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + dlaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lwork - (minmn + *n - 1); + dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to DGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + dlaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy); + dlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/orthogonal factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lwork - (minmn + *n - 1); + dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + + *n], &i__1, &info1); + } + + return 0; + +} /* dgedmdq_ */ + diff --git a/lapack-netlib/SRC/dgedmdq.f90 b/lapack-netlib/SRC/dgedmdq.f90 new file mode 100644 index 000000000..bedfba472 --- /dev/null +++ b/lapack-netlib/SRC/dgedmdq.f90 @@ -0,0 +1,704 @@ +SUBROUTINE DGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, REIG, IMEIG, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*) + REAL(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an inaccessible matrix +! A, DGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, DGEDMDQ returns the residuals +!
of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391-421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is a zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +!
contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! orthogonal matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: DGESVD (the QR SVD algorithm) +! 2 :: DGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: DGESVDQ (the preconditioned QR SVD; this and 4 +! are the most accurate options) +! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) REAL(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +!
diagonal contain, column-wise, the information on the +! Householder vectors, as returned by DGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in WORK(1:N). +! See the description of WORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K) of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER, LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using a positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, Z. +!..... +! IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N) entries of IMEIG contain +! the imaginary parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! The eigenvalues are determined as follows: +! If IMEIG(i) == 0, then the corresponding eigenvalue is +! real, LAMBDA(i) = REIG(i). +! If IMEIG(i) > 0, then the corresponding complex +! conjugate pair of eigenvalues reads +! LAMBDA(i) = REIG(i) + sqrt(-1)*IMEIG(i) +! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i) +! That is, complex conjugate pairs have consecutive +! indices (i,i+1), with the positive imaginary part +! listed first. +! See the descriptions of K, REIG, Z. +!..... +! Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array +! If JOBZ == 'V' then +!
Z contains real Ritz vectors as follows: +! If IMEIG(i)=0, then Z(:,i) is an eigenvector of +! the i-th Ritz value. +! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +! [Z(:,i) Z(:,i+1)] span an invariant subspace and +! the Ritz values extracted from this subspace are +! REIG(i) + sqrt(-1)*IMEIG(i) and +! REIG(i) - sqrt(-1)*IMEIG(i). +! The corresponding eigenvectors are +! Z(:,i) + sqrt(-1)*Z(:,i+1) and +! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +! If JOBZ == 'F', then the above descriptions hold for +! the columns of Z*V, where the columns of V are the +! eigenvectors of the K-by-K Rayleigh quotient, and Z is +! orthonormal. The columns of V are similarly structured: +! If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if +! IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and +! Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) +! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +! See the descriptions of REIG, IMEIG, X and V. +!..... +! LDZ (input) INTEGER, LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs. +! If LAMBDA(i) is real, then +! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2. +! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +! then +! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +! It holds that +! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +! See the description of Z. +!..... +! B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. +! If JOBF == 'R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1:K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF == 'N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) contains the K eigenvectors of +! the Rayleigh quotient. The eigenvectors of a complex +! conjugate pair of eigenvalues are returned in real form +! as explained in the description of Z. The Ritz vectors +! (returned in Z) are the product of X and V; see +! the descriptions of X and Z. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) REAL(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by DGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by DGEQRF of the +! M-by-N input matrix F. +! WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +!
If the call to DGEDMDQ is only a workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of WORK is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for DGEQRF[M,N]) +! MLWDMD = minimal workspace for DGEDMD (see the +! description of LWORK in DGEDMD) for +! snapshots of dimensions MIN(M,N)-by-(N-1) +! MLWMQR = N (minimal workspace for +! DORMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for DORGQR[M,N,N]) +! Then +! LWORK = MAX(N+MLWQR, N+MLWDMD) +! is updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) +! if JOBQ == 'Q' THEN +! LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2, 3 or 4. (See the description of WHTSVD). +! If on entry LWORK = -1 or LIWORK = -1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >= 1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MLWDMD, MLWGQR, & + MLWMQR, MLWORK, MLWQR, MINMN, & + OLWDMD, OLWGQR, OLWMQR, OLWORK, & + OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! Local array +! ~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DGEMM + EXTERNAL DGEQRF, DLACPY, DLASET, DORGQR, & + DORMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DGEDMD + +!
Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -22 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -25 + ELSE IF ( LDV < N-1 ) THEN + INFO = -27 + ELSE IF ( LDS < N-1 ) THEN + INFO = -29 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWQR = MAX(1,N) ! Minimal workspace length for DGEQRF. + MLWORK = MINMN + MLWQR + IF ( LQUERY ) THEN + CALL DGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, & + INFO1 ) + OLWQR = INT(RDUMMY(1)) + OLWORK = MIN(M,N) + OLWQR + END IF + CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, & + V, LDV, S, LDS, WORK, -1, IWORK, & + LIWORK, INFO1 ) + MLWDMD = INT(WORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + IMINWR = IWORK(1) + IF ( LQUERY ) THEN + OLWDMD = INT(WORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR) + IF ( LQUERY ) THEN + CALL DORMQR( 'L','N', M, N, MINMN, F, LDF, & + WORK, Z, LDZ, WORK, -1, INFO1 ) + OLWMQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = N + MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR) + IF ( LQUERY ) THEN + CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK, -1, INFO1 ) + OLWGQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR) + END IF + END IF + IMINWR = MAX( 1, IMINWR ) + MLWORK = MAX( 2, MLWORK ) + IF ( LWORK < MLWORK .AND. 
(.NOT.LQUERY) ) INFO = -31 + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'DGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL DGEQRF( M, N, F, LDF, WORK, & + WORK(MINMN+1), LWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL DLASET( 'L', MINMN, N-1, ZERO, ZERO, X, LDX ) + CALL DLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL DLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL DLASET( 'L', MINMN-2, N-2, ZERO, ZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, V, & + LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, & + IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See DGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL DLASET( 'A', M-MINMN, K, ZERO, & + ZERO, Z(MINMN+1,1), LDZ ) + CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by DGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by DGEDMD. + CALL DLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL DLASET( 'A', M-N, K, ZERO, ZERO, & + Z(N+1,1), LDZ ) + CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to DGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL DLASET( 'A', MINMN, N, ZERO, ZERO, Y, LDY ) + CALL DLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/orthogonal factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! + RETURN +! 
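!     Post-processing illustration (a hypothetical caller-side sketch,
!     not executed here): with JOBZ = 'V', the complex Ritz pairs can be
!     recovered from the real storage scheme documented above as follows.
!
!        I = 1
!        DO WHILE ( I <= K )
!           IF ( IMEIG(I) == ZERO ) THEN
!              ! Real Ritz value REIG(I) with real Ritz vector Z(:,I)
!              I = I + 1
!           ELSE
!              ! Conjugate pair REIG(I) +- sqrt(-1)*IMEIG(I) with
!              ! Ritz vectors Z(:,I) +- sqrt(-1)*Z(:,I+1)
!              I = I + 2
!           END IF
!        END DO
!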
+ END SUBROUTINE DGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f index 83d16c30e..1db85e9c2 100644 --- a/lapack-netlib/SRC/dgejsv.f +++ b/lapack-netlib/SRC/dgejsv.f @@ -253,7 +253,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then *> copied back to the U array. This 'W' option is just @@ -362,7 +362,7 @@ *> *> \param[out] IWORK *> \verbatim -*> IWORK is INTEGER array, dimension (M+3*N). +*> IWORK is INTEGER array, dimension (MAX(3,M+3*N)). *> On exit, *> IWORK(1) = the numerical rank determined after the initial *> QR factorization with pivoting. See the descriptions @@ -1386,7 +1386,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1409,7 +1409,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1454,7 +1454,7 @@ * :) .. the input matrix A is very likely a relative of * the Kahan matrix :) * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* is Q3^T*V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/dgelqt3.f b/lapack-netlib/SRC/dgelqt3.f index 5bcc06a80..ee3bdceb4 100644 --- a/lapack-netlib/SRC/dgelqt3.f +++ b/lapack-netlib/SRC/dgelqt3.f @@ -173,7 +173,8 @@ * * Compute Householder transform when M=1 * - CALL DLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL DLARFG( N, A ( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1) ) * ELSE * diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b3b3d8b2d..b1f45a2c6 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -59,12 +59,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f index c4190f2e0..38449be7f 100644 --- a/lapack-netlib/SRC/dgelss.f +++ b/lapack-netlib/SRC/dgelss.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup doubleGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -203,7 +203,7 @@ * .. * .. External Subroutines .. EXTERNAL DBDSQR, DCOPY, DGEBRD, DGELQF, DGEMM, DGEMV, - $ DGEQRF, DLABAD, DLACPY, DLASCL, DLASET, DORGBR, + $ DGEQRF, DLACPY, DLASCL, DLASET, DORGBR, $ DORMBR, DORMLQ, DORMQR, DRSCL, XERBLA * .. * .. External Functions .. @@ -385,7 +385,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -529,7 +528,7 @@ $ LDB, ZERO, WORK, N ) CALL DLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL DCOPY( N, WORK, 1, B, 1 ) END IF @@ -626,7 +625,7 @@ CALL DLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, ZERO, WORK( IWORK ), 1 ) CALL DCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -712,7 +711,7 @@ $ LDB, ZERO, WORK, N ) CALL DLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL DCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/dgelsy.f b/lapack-netlib/SRC/dgelsy.f index aebab9264..e9fcd9682 100644 --- a/lapack-netlib/SRC/dgelsy.f +++ b/lapack-netlib/SRC/dgelsy.f @@ -115,6 +115,7 @@ *> B is DOUBLE PRECISION array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -147,6 +148,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 738a122e8..87a4e702d 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -55,12 +55,6 @@ *> *> Note that the routine returns VT = V**T, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dgesvdq.f b/lapack-netlib/SRC/dgesvdq.f index 6f9ac703e..a514def8b 100644 --- a/lapack-netlib/SRC/dgesvdq.f +++ b/lapack-netlib/SRC/dgesvdq.f @@ -365,7 +365,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. 
*> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f index 197a9a626..1008aa8c6 100644 --- a/lapack-netlib/SRC/dgsvj0.f +++ b/lapack-netlib/SRC/dgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f index 11d1dde2b..f1a099b2a 100644 --- a/lapack-netlib/SRC/dgsvj1.f +++ b/lapack-netlib/SRC/dgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV = 'V', then N rows of V are post-multipled by a +*> If JOBV = 'V', then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.f b/lapack-netlib/SRC/dla_gbrfsx_extended.f index 5454b1bfc..b331178a9 100644 --- a/lapack-netlib/SRC/dla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/dla_gbrfsx_extended.f @@ -645,7 +645,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. 
EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.f b/lapack-netlib/SRC/dla_gerfsx_extended.f index 92b0d76d4..bf8260d04 100644 --- a/lapack-netlib/SRC/dla_gerfsx_extended.f +++ b/lapack-netlib/SRC/dla_gerfsx_extended.f @@ -625,7 +625,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y( 1, J ), 1 ) diff --git a/lapack-netlib/SRC/dla_porfsx_extended.f b/lapack-netlib/SRC/dla_porfsx_extended.f index 5c8850fef..94d1087cd 100644 --- a/lapack-netlib/SRC/dla_porfsx_extended.f +++ b/lapack-netlib/SRC/dla_porfsx_extended.f @@ -617,7 +617,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dla_porpvgrw.f b/lapack-netlib/SRC/dla_porpvgrw.f index 93ad3eb6a..00fdd7ae1 100644 --- a/lapack-netlib/SRC/dla_porpvgrw.f +++ b/lapack-netlib/SRC/dla_porpvgrw.f @@ -134,9 +134,9 @@ * UPPER = LSAME( 'Upper', UPLO ) * -* DPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* DPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0D+0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.f b/lapack-netlib/SRC/dla_syrfsx_extended.f index e1cde6fc2..2e06a622b 100644 --- a/lapack-netlib/SRC/dla_syrfsx_extended.f +++ b/lapack-netlib/SRC/dla_syrfsx_extended.f @@ -647,7 +647,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dlaed2.f b/lapack-netlib/SRC/dlaed2.f index 9b1f1e093..1a53650e8 100644 --- a/lapack-netlib/SRC/dlaed2.f +++ b/lapack-netlib/SRC/dlaed2.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, +* SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, * Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * .. Scalar Arguments .. @@ -28,7 +28,7 @@ * .. Array Arguments .. * INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), * $ INDXQ( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ W( * ), Z( * ) * .. * @@ -123,9 +123,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> DLAED3 to form the secular equation. *> \endverbatim @@ -148,7 +148,7 @@ *> \param[out] INDX *> \verbatim *> INDX is INTEGER array, dimension (N) -*> The permutation used to sort the contents of DLAMDA into +*> The permutation used to sort the contents of DLAMBDA into *> ascending order. 
*> \endverbatim *> @@ -207,7 +207,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, + SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, $ Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * -- LAPACK computational routine -- @@ -221,7 +221,7 @@ * .. Array Arguments .. INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), $ INDXQ( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ W( * ), Z( * ) * .. * @@ -300,9 +300,9 @@ * re-integrate the deflated parts from the last pass * DO 20 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) 20 CONTINUE - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDXC ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC ) DO 30 I = 1, N INDX( I ) = INDXQ( INDXC( I ) ) 30 CONTINUE @@ -324,11 +324,11 @@ DO 40 J = 1, N I = INDX( J ) CALL DCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 ) - DLAMDA( J ) = D( I ) + DLAMBDA( J ) = D( I ) IQ2 = IQ2 + N 40 CONTINUE CALL DLACPY( 'A', N, N, Q2, N, Q, LDQ ) - CALL DCOPY( N, DLAMDA, 1, D, 1 ) + CALL DCOPY( N, DLAMBDA, 1, D, 1 ) GO TO 190 END IF * @@ -421,7 +421,7 @@ PJ = NJ ELSE K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ @@ -433,7 +433,7 @@ * Record the last eigenvalue. * K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * @@ -470,9 +470,9 @@ PSM( CT ) = PSM( CT ) + 1 130 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * I = 1 diff --git a/lapack-netlib/SRC/dlaed3.f b/lapack-netlib/SRC/dlaed3.f index c58944e60..f9982c89e 100644 --- a/lapack-netlib/SRC/dlaed3.f +++ b/lapack-netlib/SRC/dlaed3.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, +* SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, * CTOT, W, S, INFO ) * * .. Scalar Arguments .. @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * INTEGER CTOT( * ), INDX( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ S( * ), W( * ) * .. * @@ -44,12 +44,6 @@ *> being combined by the matrix of eigenvectors of the K-by-K system *> which is solved here. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -104,14 +98,12 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in,out] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (K) +*> DLAMBDA is DOUBLE PRECISION array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles -*> of the secular equation. 
May be changed on output by -*> having lowest order bit set to zero on Cray X-MP, Cray Y-MP, -*> Cray-2, or Cray C-90, as described above. +*> of the secular equation. *> \endverbatim *> *> \param[in] Q2 @@ -180,7 +172,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, + SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, $ CTOT, W, S, INFO ) * * -- LAPACK computational routine -- @@ -193,7 +185,7 @@ * .. * .. Array Arguments .. INTEGER CTOT( * ), INDX( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ S( * ), W( * ) * .. * @@ -208,8 +200,8 @@ DOUBLE PRECISION TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DGEMM, DLACPY, DLAED4, DLASET, XERBLA @@ -240,29 +232,9 @@ IF( K.EQ.0 ) $ RETURN * -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = 1, K - CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -293,10 +265,10 @@ CALL DCOPY( K, Q, LDQ+1, W, 1 ) DO 60 J = 1, K DO 40 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 40 CONTINUE DO 50 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE 60 CONTINUE DO 70 I = 1, K diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f index 3631fb456..5d1d9144d 100644 --- a/lapack-netlib/SRC/dlaed8.f +++ b/lapack-netlib/SRC/dlaed8.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, -* CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, +* CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, * GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * .. Scalar Arguments .. @@ -30,7 +30,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), * $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -141,9 +141,9 @@ *> process. 
*> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> DLAED3 to form the secular equation. *> \endverbatim @@ -238,7 +238,7 @@ * * ===================================================================== SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, - $ CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, + $ CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, $ GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * -- LAPACK computational routine -- @@ -253,7 +253,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -339,14 +339,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -464,7 +464,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -476,26 +476,26 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 110 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * IF( ICOMPQ.EQ.0 ) THEN DO 120 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) 120 CONTINUE ELSE DO 130 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL DCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 130 CONTINUE @@ -506,9 +506,9 @@ * IF( K.LT.N ) THEN IF( ICOMPQ.EQ.0 ) THEN - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) ELSE - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL DLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, $ Q( 1, K+1 ), LDQ ) END IF diff --git a/lapack-netlib/SRC/dlaed9.f b/lapack-netlib/SRC/dlaed9.f index b88cdd907..0d209c2c2 100644 --- a/lapack-netlib/SRC/dlaed9.f +++ b/lapack-netlib/SRC/dlaed9.f @@ -18,15 +18,15 @@ * Definition: * =========== * -* SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, -* S, LDS, INFO ) +* SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, +* W, S, LDS, INFO ) * * .. Scalar Arguments .. * INTEGER INFO, K, KSTART, KSTOP, LDQ, LDS, N * DOUBLE PRECISION RHO * .. * .. Array Arguments .. -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), * $ W( * ) * .. * @@ -96,9 +96,9 @@ *> RHO >= 0 required. 
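
For background on why the DLAMBDA entries are called poles of the secular equation (standard divide-and-conquer theory, not part of this patch): after deflation, DLAED4, called from DLAED3 above and DLAED9 below, finds for each J the J-th root of

    f(lambda) = 1 + RHO * sum_{j=1..K} W(j)**2 / ( DLAMBDA(j) - lambda ) = 0,

whose K roots interlace the poles DLAMBDA(1) < ... < DLAMBDA(K) when RHO > 0 and become the updated eigenvalues returned in D.
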
*> \endverbatim *> -*> \param[in] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (K) +*> DLAMBDA is DOUBLE PRECISION array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles *> of the secular equation. @@ -151,8 +151,8 @@ *> at Berkeley, USA * * ===================================================================== - SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, - $ S, LDS, INFO ) + SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, + $ W, S, LDS, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -163,7 +163,7 @@ DOUBLE PRECISION RHO * .. * .. Array Arguments .. - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), $ W( * ) * .. * @@ -174,8 +174,8 @@ DOUBLE PRECISION TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DLAED4, XERBLA @@ -212,30 +212,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, N - DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = KSTART, KSTOP - CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -261,10 +240,10 @@ CALL DCOPY( K, Q, LDQ+1, W, 1 ) DO 70 J = 1, K DO 50 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE DO 60 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 60 CONTINUE 70 CONTINUE DO 80 I = 1, K diff --git a/lapack-netlib/SRC/dlals0.f b/lapack-netlib/SRC/dlals0.f index cfca22280..928405e22 100644 --- a/lapack-netlib/SRC/dlals0.f +++ b/lapack-netlib/SRC/dlals0.f @@ -389,6 +389,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = POLES( I, 2 )*Z( I ) / $ ( DLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -440,6 +445,11 @@ IF( Z( J ).EQ.ZERO ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. 
The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/dlalsa.f b/lapack-netlib/SRC/dlalsa.f index da8e0fa17..d89f5d2f9 100644 --- a/lapack-netlib/SRC/dlalsa.f +++ b/lapack-netlib/SRC/dlalsa.f @@ -43,9 +43,9 @@ *> *> \verbatim *> -*> DLALSA is an itermediate step in solving the least squares problem +*> DLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/dlalsd.f b/lapack-netlib/SRC/dlalsd.f index d22c45dc6..706ac4c90 100644 --- a/lapack-netlib/SRC/dlalsd.f +++ b/lapack-netlib/SRC/dlalsd.f @@ -47,12 +47,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index c4cb95fd3..84cb96bcb 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -102,7 +102,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -332,7 +332,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, + EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, $ DLARTG, DROT DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME @@ -482,7 +482,6 @@ * Get machine constants SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) @@ -567,7 +566,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f index 10ab6461e..2c7ea59b0 100644 --- a/lapack-netlib/SRC/dlarfb_gett.f +++ b/lapack-netlib/SRC/dlarfb_gett.f @@ -451,7 +451,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f index 70f59b829..1cceed1a2 100644 --- a/lapack-netlib/SRC/dlarre.f +++ b/lapack-netlib/SRC/dlarre.f @@ -51,7 +51,7 @@ *> DSTEMR to compute the eigenvectors of T. 
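
A minimal sketch of the DLAMC3 idiom referenced in the comments added above: DLAMC3 is an external function that simply returns the sum of its two arguments, but because an optimizing compiler cannot see through the call, the sum X + Y is forced to be formed first (the values below are hypothetical):

      PROGRAM LAMC3X
      DOUBLE PRECISION   X, Y, Z, W
      DOUBLE PRECISION   DLAMC3
      EXTERNAL           DLAMC3
      X = 1.0D+0
      Y = 1.0D-17
      Z = 1.0D+0
*     Evaluates exactly as ( X + Y ) - Z; the bare expression
*     X + Y - Z could legally be reassociated as X + ( Y - Z ).
      W = DLAMC3( X, Y ) - Z
      WRITE( *, * ) W
      END
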
*> The accuracy varies depending on whether bisection is used to *> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to -*> conpute all and then discard any unwanted one. +*> compute all and then discard any unwanted one. *> As an added benefit, DLARRE also outputs the n *> Gerschgorin intervals for the matrices L_i D_i L_i^T. *> \endverbatim diff --git a/lapack-netlib/SRC/dlaruv.f b/lapack-netlib/SRC/dlaruv.f index 0f5c9541d..1a4ce4009 100644 --- a/lapack-netlib/SRC/dlaruv.f +++ b/lapack-netlib/SRC/dlaruv.f @@ -382,6 +382,11 @@ $ 1537 / * .. * .. Executable Statements .. +* +* Quick return for N < 1 + IF ( N < 1 ) THEN + RETURN + END IF * I1 = ISEED( 1 ) I2 = ISEED( 2 ) diff --git a/lapack-netlib/SRC/dlas2.f b/lapack-netlib/SRC/dlas2.f index ea929e86f..ff75e3942 100644 --- a/lapack-netlib/SRC/dlas2.f +++ b/lapack-netlib/SRC/dlas2.f @@ -93,9 +93,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows, or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows, or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/dlasd0.f b/lapack-netlib/SRC/dlasd0.f index 215dc8717..6d8d4e2db 100644 --- a/lapack-netlib/SRC/dlasd0.f +++ b/lapack-netlib/SRC/dlasd0.f @@ -79,10 +79,11 @@ *> On exit, E has been destroyed. *> \endverbatim *> -*> \param[out] U +*> \param[in,out] U *> \verbatim *> U is DOUBLE PRECISION array, dimension (LDU, N) -*> On exit, U contains the left singular vectors. +*> On exit, U contains the left singular vectors, +*> if U passed in as (N, N) Identity. *> \endverbatim *> *> \param[in] LDU @@ -91,10 +92,11 @@ *> On entry, leading dimension of U. *> \endverbatim *> -*> \param[out] VT +*> \param[in,out] VT *> \verbatim *> VT is DOUBLE PRECISION array, dimension (LDVT, M) -*> On exit, VT**T contains the right singular vectors. +*> On exit, VT**T contains the right singular vectors, +*> if VT passed in as (M, M) Identity. *> \endverbatim *> *> \param[in] LDVT diff --git a/lapack-netlib/SRC/dlasd3.f b/lapack-netlib/SRC/dlasd3.f index df939efc5..44957377b 100644 --- a/lapack-netlib/SRC/dlasd3.f +++ b/lapack-netlib/SRC/dlasd3.f @@ -44,13 +44,6 @@ *> appropriate calls to DLASD4 and then updates the singular *> vectors by matrix multiplication. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> DLASD3 is called from DLASD1. *> \endverbatim * @@ -103,7 +96,7 @@ *> The leading dimension of the array Q. LDQ >= K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is DOUBLE PRECISION array, dimension(K) *> The first K elements of this array contain the old roots @@ -249,8 +242,8 @@ DOUBLE PRECISION RHO, TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. 
EXTERNAL DCOPY, DGEMM, DLACPY, DLASCL, DLASD4, XERBLA @@ -310,27 +303,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DSIGMA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 20 I = 1, K - DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 20 CONTINUE -* * Keep a copy of Z. * CALL DCOPY( K, Z, 1, Q, 1 ) diff --git a/lapack-netlib/SRC/dlasd8.f b/lapack-netlib/SRC/dlasd8.f index a769bdb22..73c3ef0b4 100644 --- a/lapack-netlib/SRC/dlasd8.f +++ b/lapack-netlib/SRC/dlasd8.f @@ -121,14 +121,12 @@ *> The leading dimension of DIFR, must be at least K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is DOUBLE PRECISION array, dimension ( K ) *> On entry, the first K elements of this array contain the old *> roots of the deflated updating problem. These are the poles *> of the secular equation. -*> On exit, the elements of DSIGMA may be very slightly altered -*> in value. *> \endverbatim *> *> \param[out] WORK @@ -227,27 +225,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 10 CONTINUE -* * Book keeping. * IWK1 = 1 @@ -312,6 +289,11 @@ DSIGJP = -DSIGMA( J+1 ) END IF WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ ) +* +* Use calls to the subroutine DLAMC3 to enforce the parentheses +* (x+y)+z. The goal is to prevent optimizing compilers +* from doing x+(y+z). +* DO 60 I = 1, J - 1 WORK( I ) = Z( I ) / ( DLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ ) $ / ( DSIGMA( I )+DJ ) diff --git a/lapack-netlib/SRC/dlasv2.f b/lapack-netlib/SRC/dlasv2.f index 64a06dee1..cb2bf51c4 100644 --- a/lapack-netlib/SRC/dlasv2.f +++ b/lapack-netlib/SRC/dlasv2.f @@ -124,9 +124,7 @@ *> infinite. 
*> *> Overflow will not occur unless the largest singular value itself -*> overflows or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index fb8857145..c95c94cbc 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB*M. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index be156bee2..b282f4227 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -261,6 +261,9 @@ DOUBLE PRECISION BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + DOUBLE PRECISION WORK(1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX @@ -362,7 +365,7 @@ * A is upper triangular. * DO J = 2, N - TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index b4a98bc78..e6d78b672 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -574,7 +574,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index f5cbb76bb..94a04be02 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. 
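
The LWORK declarations completed above and the query convention described immediately below (LWORK = -1) follow the standard LAPACK workspace-query protocol. A minimal sketch for DLATSQR, assuming its documented block-size constraints MB > N and 1 <= NB <= N (all sizes here are arbitrary illustrative values):

      PROGRAM WQUERY
      INTEGER            M, N, MB, NB, LDA, LDT, LWMAX
      PARAMETER          ( M = 100, N = 10, MB = 20, NB = 5,
     $                     LDA = M, LDT = NB, LWMAX = 1000 )
      INTEGER            I, J, LWORK, INFO
      DOUBLE PRECISION   A( LDA, N ), T( LDT, N ), WORK( LWMAX )
      INTRINSIC          DBLE, INT, MIN
      EXTERNAL           DLATSQR
      DO 20 J = 1, N
         DO 10 I = 1, M
            A( I, J ) = DBLE( I ) / DBLE( I+J )
   10    CONTINUE
   20 CONTINUE
*     Workspace query: LWORK = -1 only writes the optimal size
*     into WORK( 1 ) without touching A or T.
      CALL DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*     Actual factorization with the queried workspace size.
      CALL DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
     $              INFO )
      WRITE( *, * ) 'INFO =', INFO
      END
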
*> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/dorbdb1.f b/lapack-netlib/SRC/dorbdb1.f index dae18f5df..b5b2d1362 100644 --- a/lapack-netlib/SRC/dorbdb1.f +++ b/lapack-netlib/SRC/dorbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f index a0dacbb16..0b4ad732c 100644 --- a/lapack-netlib/SRC/dorbdb2.f +++ b/lapack-netlib/SRC/dorbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb3.f b/lapack-netlib/SRC/dorbdb3.f index 7b3727956..79b10a5d3 100644 --- a/lapack-netlib/SRC/dorbdb3.f +++ b/lapack-netlib/SRC/dorbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f index 08604be45..985be3277 100644 --- a/lapack-netlib/SRC/dorbdb4.f +++ b/lapack-netlib/SRC/dorbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> DORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorgtsqr.f b/lapack-netlib/SRC/dorgtsqr.f index 34d96b238..2e5cd40be 100644 --- a/lapack-netlib/SRC/dorgtsqr.f +++ b/lapack-netlib/SRC/dorgtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -301,4 +302,4 @@ * * End of DORGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dpbsv.f b/lapack-netlib/SRC/dpbsv.f index a52e78309..2d8f06440 100644 --- a/lapack-netlib/SRC/dpbsv.f +++ b/lapack-netlib/SRC/dpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dpbsvx.f b/lapack-netlib/SRC/dpbsvx.f index 1bf526fc7..142dda5da 100644 --- a/lapack-netlib/SRC/dpbsvx.f +++ b/lapack-netlib/SRC/dpbsvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. 
If the reciprocal of the condition number is less than machine @@ -281,10 +281,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpbtf2.f b/lapack-netlib/SRC/dpbtf2.f index 534629294..1284c9ec8 100644 --- a/lapack-netlib/SRC/dpbtf2.f +++ b/lapack-netlib/SRC/dpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpbtrf.f b/lapack-netlib/SRC/dpbtrf.f index 8256f8938..29e9aaecd 100644 --- a/lapack-netlib/SRC/dpbtrf.f +++ b/lapack-netlib/SRC/dpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpftrf.f b/lapack-netlib/SRC/dpftrf.f index 980debaf2..312dcdf84 100644 --- a/lapack-netlib/SRC/dpftrf.f +++ b/lapack-netlib/SRC/dpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dposv.f b/lapack-netlib/SRC/dposv.f index ee2988e6f..cb76e9977 100644 --- a/lapack-netlib/SRC/dposv.f +++ b/lapack-netlib/SRC/dposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dposvx.f b/lapack-netlib/SRC/dposvx.f index 4a0b9d605..faffff803 100644 --- a/lapack-netlib/SRC/dposvx.f +++ b/lapack-netlib/SRC/dposvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. 
Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -277,10 +277,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dposvxx.f b/lapack-netlib/SRC/dposvxx.f index e74b23c80..b5336cf63 100644 --- a/lapack-netlib/SRC/dposvxx.f +++ b/lapack-netlib/SRC/dposvxx.f @@ -88,7 +88,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/dpotf2.f b/lapack-netlib/SRC/dpotf2.f index 08fa4957f..30da5c3f3 100644 --- a/lapack-netlib/SRC/dpotf2.f +++ b/lapack-netlib/SRC/dpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpotrf.f b/lapack-netlib/SRC/dpotrf.f index 1679fc3cd..65509feb8 100644 --- a/lapack-netlib/SRC/dpotrf.f +++ b/lapack-netlib/SRC/dpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpotrf2.f b/lapack-netlib/SRC/dpotrf2.f index 6c28ce6d6..aaf9b9c58 100644 --- a/lapack-netlib/SRC/dpotrf2.f +++ b/lapack-netlib/SRC/dpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. 
*> \endverbatim * diff --git a/lapack-netlib/SRC/dppsv.f b/lapack-netlib/SRC/dppsv.f index 435703b08..1888005d9 100644 --- a/lapack-netlib/SRC/dppsv.f +++ b/lapack-netlib/SRC/dppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dppsvx.f b/lapack-netlib/SRC/dppsvx.f index cb41d39ee..3b08fc821 100644 --- a/lapack-netlib/SRC/dppsvx.f +++ b/lapack-netlib/SRC/dppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpptrf.f b/lapack-netlib/SRC/dpptrf.f index 2d8de5110..d9eed910d 100644 --- a/lapack-netlib/SRC/dpptrf.f +++ b/lapack-netlib/SRC/dpptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpteqr.f b/lapack-netlib/SRC/dpteqr.f index aa1f1a80c..d07b065b0 100644 --- a/lapack-netlib/SRC/dpteqr.f +++ b/lapack-netlib/SRC/dpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. 
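
The INFO wording standardized throughout these hunks ("leading principal minor of order i ... not positive") describes the usual Cholesky breakdown. A minimal sketch of detecting it with DPOTRF (the 2-by-2 matrix is a hypothetical example chosen to be indefinite):

      PROGRAM POCHK
      INTEGER            N, LDA, INFO
      PARAMETER          ( N = 2, LDA = 2 )
      DOUBLE PRECISION   A( LDA, N )
      EXTERNAL           DPOTRF
*     A = [ 1 2; 2 1 ] is symmetric but indefinite: its leading
*     principal minor of order 2 is 1*1 - 2*2 = -3.
      A( 1, 1 ) = 1.0D+0
      A( 1, 2 ) = 2.0D+0
      A( 2, 1 ) = 2.0D+0
      A( 2, 2 ) = 1.0D+0
      CALL DPOTRF( 'U', N, A, LDA, INFO )
      IF( INFO.GT.0 ) THEN
*        Breakdown at step INFO: the leading principal minor of
*        order INFO is not positive, so no Cholesky factor exists.
         WRITE( *, * ) 'not positive definite, INFO =', INFO
      END IF
      END
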
diff --git a/lapack-netlib/SRC/dptsv.f b/lapack-netlib/SRC/dptsv.f index addc34b88..41d8cff15 100644 --- a/lapack-netlib/SRC/dptsv.f +++ b/lapack-netlib/SRC/dptsv.f @@ -93,8 +93,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/dptsvx.f b/lapack-netlib/SRC/dptsvx.f index 7fb6cf436..fcbf5aa3b 100644 --- a/lapack-netlib/SRC/dptsvx.f +++ b/lapack-netlib/SRC/dptsvx.f @@ -59,7 +59,7 @@ *> factorization can also be regarded as having the form *> A = U**T*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -199,10 +199,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpttrf.f b/lapack-netlib/SRC/dpttrf.f index e0022e3ad..156e845a3 100644 --- a/lapack-netlib/SRC/dpttrf.f +++ b/lapack-netlib/SRC/dpttrf.f @@ -70,8 +70,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/dsbevd.f b/lapack-netlib/SRC/dsbevd.f index 3eb4ed8df..350c0a9f0 100644 --- a/lapack-netlib/SRC/dsbevd.f +++ b/lapack-netlib/SRC/dsbevd.f @@ -40,12 +40,6 @@ *> a real symmetric band matrix A. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbevd_2stage.f b/lapack-netlib/SRC/dsbevd_2stage.f index 45a64b478..82997c850 100644 --- a/lapack-netlib/SRC/dsbevd_2stage.f +++ b/lapack-netlib/SRC/dsbevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses *> a divide and conquer algorithm. 
*> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbgvd.f b/lapack-netlib/SRC/dsbgvd.f index 30b016611..0ab3177ac 100644 --- a/lapack-netlib/SRC/dsbgvd.f +++ b/lapack-netlib/SRC/dsbgvd.f @@ -43,12 +43,6 @@ *> banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f index 20de17931..53deae580 100644 --- a/lapack-netlib/SRC/dsbgvx.f +++ b/lapack-netlib/SRC/dsbgvx.f @@ -267,7 +267,7 @@ *> Their indices are stored in IFAIL. *> > N: DPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspevd.f b/lapack-netlib/SRC/dspevd.f index d9d6c8917..05aa91b03 100644 --- a/lapack-netlib/SRC/dspevd.f +++ b/lapack-netlib/SRC/dspevd.f @@ -40,12 +40,6 @@ *> of a real symmetric matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dspgv.f b/lapack-netlib/SRC/dspgv.f index d8ec3b1a4..96041c301 100644 --- a/lapack-netlib/SRC/dspgv.f +++ b/lapack-netlib/SRC/dspgv.f @@ -139,7 +139,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero. *> > N: if INFO = n + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index df215ae1a..24c2309c3 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. 
It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -184,7 +178,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspgvx.f b/lapack-netlib/SRC/dspgvx.f index ec93147aa..5afd73d02 100644 --- a/lapack-netlib/SRC/dspgvx.f +++ b/lapack-netlib/SRC/dspgvx.f @@ -245,7 +245,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsposv.f b/lapack-netlib/SRC/dsposv.f index c3b8de0e3..0bd75698d 100644 --- a/lapack-netlib/SRC/dsposv.f +++ b/lapack-netlib/SRC/dsposv.f @@ -177,8 +177,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of (DOUBLE -*> PRECISION) A is not positive definite, so the +*> > 0: if INFO = i, the leading principal minor of order i +*> of (DOUBLE PRECISION) A is not positive, so the *> factorization could not be completed, and the solution *> has not been computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dstedc.f b/lapack-netlib/SRC/dstedc.f index 2ed84afaa..6d533664b 100644 --- a/lapack-netlib/SRC/dstedc.f +++ b/lapack-netlib/SRC/dstedc.f @@ -42,12 +42,6 @@ *> found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dstegr.f b/lapack-netlib/SRC/dstegr.f index 01ec101d8..598c60e8d 100644 --- a/lapack-netlib/SRC/dstegr.f +++ b/lapack-netlib/SRC/dstegr.f @@ -56,7 +56,7 @@ *> *> Note : DSTEGR and DSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/dstemr.f b/lapack-netlib/SRC/dstemr.f index d0c71ddd9..44a33423e 100644 --- a/lapack-netlib/SRC/dstemr.f +++ b/lapack-netlib/SRC/dstemr.f @@ -303,7 +303,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \ingroup doubleOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -312,7 +312,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE DSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -344,7 +345,8 @@ $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -380,6 +382,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * DSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -502,6 +505,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -509,8 +521,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -533,8 +550,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/dstevd.f b/lapack-netlib/SRC/dstevd.f index 507f39b2b..54717df3d 100644 --- a/lapack-netlib/SRC/dstevd.f +++ b/lapack-netlib/SRC/dstevd.f @@ -40,12 +40,6 @@ *> real symmetric tridiagonal matrix. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsyconvf.f b/lapack-netlib/SRC/dsyconvf.f index 9e7a5af0e..005c98a03 100644 --- a/lapack-netlib/SRC/dsyconvf.f +++ b/lapack-netlib/SRC/dsyconvf.f @@ -39,7 +39,7 @@ *> DSYTRF provided on entry in parameter A into the factorization *> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in DSYTRF into +*> the interchanges stored in IPIV from the format used in DSYTRF into *> the format used in DSYTRF_RK (or DSYTRF_BK). 
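
For context on the LAESWAP logic added to DSTEMR above: DLAEV2 orders its outputs by absolute value, |RT1| >= |RT2|, which is not the same as RT1 >= RT2 when the eigenvalue of larger magnitude is negative; DSTEMR now detects that case and swaps before storing W and Z. A minimal sketch showing the convention (the matrix entries are hypothetical):

      PROGRAM EV2CHK
      DOUBLE PRECISION   RT1, RT2, CS, SN
      EXTERNAL           DLAEV2
*     [ -3  1 ]  has eigenvalues -2 and -4; DLAEV2 orders by
*     [  1 -3 ]  magnitude, so it returns RT1 = -4 < RT2 = -2.
      CALL DLAEV2( -3.0D+0, 1.0D+0, -3.0D+0, RT1, RT2, CS, SN )
*     ( CS, SN ) is the unit eigenvector belonging to RT1.
      WRITE( *, * ) 'RT1 =', RT1, ' RT2 =', RT2
      END
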
*> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or DSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in DSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in DSYTRF_RK +*> the interchanges stored in IPIV from the format used in DSYTRF_RK *> (or DSYTRF_BK) into the format used in DSYTRF. *> \endverbatim * @@ -322,7 +322,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -466,7 +466,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -532,7 +532,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/dsyconvf_rook.f b/lapack-netlib/SRC/dsyconvf_rook.f index d7f529358..c3f2083bd 100644 --- a/lapack-netlib/SRC/dsyconvf_rook.f +++ b/lapack-netlib/SRC/dsyconvf_rook.f @@ -517,7 +517,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index eaaecd8d9..b27f4cdc7 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -40,13 +40,6 @@ *> real symmetric matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> Because of large use of BLAS of level 3, DSYEVD needs N**2 more *> workspace than DSYEVX. *> \endverbatim diff --git a/lapack-netlib/SRC/dsyevd_2stage.f b/lapack-netlib/SRC/dsyevd_2stage.f index 0eae8ad06..d5a68c35d 100644 --- a/lapack-netlib/SRC/dsyevd_2stage.f +++ b/lapack-netlib/SRC/dsyevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsygv.f b/lapack-netlib/SRC/dsygv.f index 5208dbb1f..02a4cc3ed 100644 --- a/lapack-netlib/SRC/dsygv.f +++ b/lapack-netlib/SRC/dsygv.f @@ -154,7 +154,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. 
*> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygv_2stage.f b/lapack-netlib/SRC/dsygv_2stage.f index 5c71ebf94..383304267 100644 --- a/lapack-netlib/SRC/dsygv_2stage.f +++ b/lapack-netlib/SRC/dsygv_2stage.f @@ -173,7 +173,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index 3b38665a7..41a384c80 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -42,12 +42,6 @@ *> B are assumed to be symmetric and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -190,7 +184,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygvx.f b/lapack-netlib/SRC/dsygvx.f index 3fa55b97c..2dc27e8a8 100644 --- a/lapack-netlib/SRC/dsygvx.f +++ b/lapack-netlib/SRC/dsygvx.f @@ -270,7 +270,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 1100702ba..72fbe1e9a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -89,7 +89,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). 
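
The generalized symmetric-definite drivers touched above (DSYGV, DSYGV_2STAGE, DSYGVD, DSYGVX) all share the INFO = N + i convention that this patch now documents consistently. A minimal sketch of decoding it (sizes and data are hypothetical; B is deliberately indefinite so the internal Cholesky step fails):

      PROGRAM GVCHK
      INTEGER            N, LDA, LDB, LWORK, INFO
      PARAMETER          ( N = 2, LDA = 2, LDB = 2, LWORK = 64 )
      DOUBLE PRECISION   A( LDA, N ), B( LDB, N ), W( N ),
     $                   WORK( LWORK )
      EXTERNAL           DSYGV
      A( 1, 1 ) = 2.0D+0
      A( 1, 2 ) = 0.0D+0
      A( 2, 1 ) = 0.0D+0
      A( 2, 2 ) = 3.0D+0
*     B = [ 1 2; 2 1 ] is indefinite, so DPOTRF inside DSYGV
*     breaks down at step 2 and DSYGV returns INFO = N + 2.
      B( 1, 1 ) = 1.0D+0
      B( 1, 2 ) = 2.0D+0
      B( 2, 1 ) = 2.0D+0
      B( 2, 2 ) = 1.0D+0
      CALL DSYGV( 1, 'N', 'U', N, A, LDA, B, LDB, W, WORK,
     $            LWORK, INFO )
      IF( INFO.GT.N ) THEN
         WRITE( *, * ) 'leading principal minor of order',
     $      INFO - N, ' of B is not positive'
      END IF
      END
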
*> \endverbatim *> diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index fabc10756..bb74dd491 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -506,7 +506,7 @@ $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index a39b03283..aee9b3f6a 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 346737953..9a0b26ce5 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index b7f5f07c2..c65bd86e6 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 7341b9263..086586968 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dtgevc.f b/lapack-netlib/SRC/dtgevc.f index e7084664c..be70b2083 100644 --- a/lapack-netlib/SRC/dtgevc.f +++ b/lapack-netlib/SRC/dtgevc.f @@ -52,7 +52,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal blocks of S and P. *> @@ -337,7 +337,7 @@ EXTERNAL LSAME, DLAMCH * .. * .. External Subroutines .. - EXTERNAL DGEMV, DLABAD, DLACPY, DLAG2, DLALN2, XERBLA + EXTERNAL DGEMV, DLACPY, DLAG2, DLALN2, XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, MAX, MIN @@ -463,7 +463,6 @@ * SAFMIN = DLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL DLABAD( SAFMIN, BIG ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/dtgsna.f b/lapack-netlib/SRC/dtgsna.f index b9b3ad8af..013dc91bd 100644 --- a/lapack-netlib/SRC/dtgsna.f +++ b/lapack-netlib/SRC/dtgsna.f @@ -632,8 +632,8 @@ C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA ) C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI ROOT1 = C1 + SQRT( C1*C1-4.0D0*C2 ) - ROOT2 = C2 / ROOT1 ROOT1 = ROOT1 / TWO + ROOT2 = C2 / ROOT1 COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) ) END IF * diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f index a4651e788..c8c04ad13 100644 --- a/lapack-netlib/SRC/dtrevc3.f +++ b/lapack-netlib/SRC/dtrevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'DTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f index c44ec3808..31a5230ba 100644 --- a/lapack-netlib/SRC/dtrsyl3.f +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -1220,7 +1220,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index c701c2be0..52d507e9d 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -89,14 +89,14 @@ *> *> \param[in] NBI *> \verbatim -*> NBI is INTEGER which is the used in the reduciton, +*> NBI is INTEGER which is the used in the reduction, *> (e.g., the size of the band), needed to compute workspace *> and LHOUS2. *> \endverbatim *> *> \param[in] IBI *> \verbatim -*> IBI is INTEGER which represent the IB of the reduciton, +*> IBI is INTEGER which represent the IB of the reduction, *> needed to compute workspace and LHOUS2. *> \endverbatim *> diff --git a/lapack-netlib/SRC/sbdsdc.f b/lapack-netlib/SRC/sbdsdc.f index 18a404497..2a6cc9970 100644 --- a/lapack-netlib/SRC/sbdsdc.f +++ b/lapack-netlib/SRC/sbdsdc.f @@ -45,13 +45,6 @@ *> respectively. SBDSDC can be used to compute all singular values, *> and optionally, singular vectors or singular vectors in compact form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLASD3 for details. -*> *> The code currently calls SLASDQ if singular values only are desired. *> However, it can be slightly modified to compute singular values *> using the divide and conquer method. diff --git a/lapack-netlib/SRC/sbdsqr.f b/lapack-netlib/SRC/sbdsqr.f index c798baaf5..880f0607b 100644 --- a/lapack-netlib/SRC/sbdsqr.f +++ b/lapack-netlib/SRC/sbdsqr.f @@ -277,7 +277,7 @@ $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM REAL ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. 
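The LWKOPT = MAX( 1, N*NB ) changes in DSYTRF and DSYTRF_RK above, and MAXWRK = MAX( 1, N + 2*N*NB ) in DTREVC3, keep the workspace query honest in the degenerate N = 0 case: LAPACK requires LWORK >= 1 on the actual call, so a query must never report an optimal size of 0. A sketch of the query protocol these routines serve, calling the Fortran symbol directly from C (the lowercase-with-underscore name mangling is assumed, and hidden character-length arguments that some Fortran ABIs append are omitted):

#include <stdlib.h>

/* Fortran LAPACK symbol; mangling assumed (DSYTRF -> dsytrf_). */
extern void dsytrf_(const char *uplo, const int *n, double *a, const int *lda,
                    int *ipiv, double *work, const int *lwork, int *info);

/* Query the optimal workspace for DSYTRF, then allocate it. */
double *alloc_dsytrf_work(int n, int *lwork_out) {
    int lda = n > 0 ? n : 1, lwork = -1, info, ipiv_dummy;
    double a_dummy, wkopt;
    /* LWORK = -1 requests a workspace query: A and IPIV are not
       referenced, only WORK(1) is set -- and with the MAX(1, N*NB)
       fix it is >= 1 even when N = 0. */
    dsytrf_("U", &n, &a_dummy, &lda, &ipiv_dummy, &wkopt, &lwork, &info);
    *lwork_out = (int)wkopt;
    return malloc((size_t)*lwork_out * sizeof(double));
}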
@@ -390,7 +390,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -450,7 +450,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -459,7 +458,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -541,14 +539,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -569,14 +567,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -586,7 +584,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/sbdsvdx.f b/lapack-netlib/SRC/sbdsvdx.f index dcd554f1d..d7cb2dc83 100644 --- a/lapack-netlib/SRC/sbdsvdx.f +++ b/lapack-netlib/SRC/sbdsvdx.f @@ -45,7 +45,7 @@ *> *> Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ] *> and superdiagonal E = [ e_1 e_2 ... e_N-1 ], SBDSVDX computes the -*> singular value decompositon of B through the eigenvalues and +*> singular value decomposition of B through the eigenvalues and *> eigenvectors of the N*2-by-N*2 tridiagonal matrix *> *> | 0 d_1 | diff --git a/lapack-netlib/SRC/sgebal.f b/lapack-netlib/SRC/sgebal.f index f519c8c57..7c115fb6c 100644 --- a/lapack-netlib/SRC/sgebal.f +++ b/lapack-netlib/SRC/sgebal.f @@ -153,6 +153,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -181,8 +184,8 @@ PARAMETER ( FACTOR = 0.95E+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L REAL C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -197,7 +200,7 @@ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN -* +* .. * Test the input parameters * INFO = 0 @@ -214,176 +217,192 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. 
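The SGEBAL rewrite that begins here trades the original GOTO state machine for structured DO WHILE loops; the routine's observable contract (ILO, IHI, SCALE) is meant to be unchanged. For reference, that contract as seen from C through the LAPACKE wrapper (assumed available): the matrix is first permuted so that isolated eigenvalues move outside rows/columns ILO..IHI, then the active block is diagonally scaled to balance row and column norms.

#include <stdio.h>
#include <lapacke.h>

int main(void) {
    /* Row/column 3 of this matrix is decoupled from the rest, so
       SGEBAL can isolate that eigenvalue by permutation alone. */
    float a[9] = { 4.0f, 1.0f, 0.0f,
                   2.0f, 3.0f, 0.0f,
                   0.0f, 0.0f, 5.0f };
    lapack_int ilo, ihi;
    float scale[3];
    LAPACKE_sgebal(LAPACK_ROW_MAJOR, 'B', 3, a, 3, &ilo, &ihi, scale);
    /* Eigenvalue computation then only needs rows/columns ILO..IHI;
       SCALE records the permutations and scaling factors applied. */
    printf("ilo=%d ihi=%d\n", (int)ilo, (int)ihi);
    return 0;
}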
-* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL SSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL SSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 +* Permutation to isolate eigenvalues if possible. * - 50 CONTINUE - DO 70 J = L, 1, -1 -* - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( A( J, I ).NE.ZERO ) - $ GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL SSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL SSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL SSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL SSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( A( I, J ).NE.ZERO ) - $ GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = SNRM2( L-K+1, A( K, I ), 1 ) - R = SNRM2( L-K+1, A( I, K ), LDA ) - ICA = ISAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = ISAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - IF( SISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. 
* - INFO = -3 - CALL XERBLA( 'SGEBAL', -INFO ) - RETURN - END IF - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L +* + C = SNRM2( L-K+1, A( K, I ), 1 ) + R = SNRM2( L-K+1, A( I, K ), LDA ) + ICA = ISAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = ISAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - CALL SSCAL( N-K+1, G, A( I, K ), LDA ) - CALL SSCAL( L, F, A( 1, I ), 1 ) +* Guard against zero C or R due to underflow. * - 200 CONTINUE + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( SISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'SGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. 
+* + CALL SSCAL( N-K+1, G, A( I, K ), LDA ) + CALL SSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/sgedmd.c b/lapack-netlib/SRC/sgedmd.c new file mode 100644 index 000000000..c8f3a5964 --- /dev/null +++ b/lapack-netlib/SRC/sgedmd.c @@ -0,0 +1,1746 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <complex.h> +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char
*addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. 
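The NRNK/TOL rules documented above determine the numerical rank K of X from its singular values. Schematically, for sv[0] >= sv[1] >= ... >= sv[n-1] (an illustrative helper written against the documentation, not the source; the routine itself additionally truncates values below the safe minimum):

/* K = number of retained singular values, following the documented
   NRNK/TOL truncation rules (illustrative only). */
static int dmd_rank(const float *sv, int n, int nrnk, float tol) {
    int k = 1;
    if (nrnk == -1) {                      /* drop sigma(i) <= TOL*sigma(1)   */
        while (k < n && sv[k] > tol * sv[0]) k++;
    } else if (nrnk == -2) {               /* drop sigma(i) <= TOL*sigma(i-1) */
        while (k < n && sv[k] > tol * sv[k - 1]) k++;
    } else {                               /* 0 < NRNK <= N: hard cap, but    */
        while (k < nrnk && sv[k] > 0.0f) k++;  /* only nonzero values count   */
    }
    return k;
}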
See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, and Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of IMEIG contain */ +/* the imaginary parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* The eigenvalues are determined as follows: */ +/* If IMEIG(i) == 0, then the corresponding eigenvalue is */ +/* real, LAMBDA(i) = REIG(i). */ +/* If IMEIG(i)>0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, and Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* || Z(:,i:i+1)||_F = 1. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of X(:,1:K)*W(1:K,1:K), where the columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */ +/* are similarly structured: If IMEIG(i) == 0 then */ +/* X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */ +/* then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */ +/* X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of REIG, IMEIG and Z. */ +/* ..... 
*/ +/* B (output) REAL(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) REAL(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient (real and */ +/* imaginary parts for each complex conjugate pair of the */ +/* eigenvalues). The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* left singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) REAL(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by SGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, WORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */ +/* scaling factor WORK(N+2)/WORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to SGEDMD is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* If WHTSVD == 1 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */ +/* If JOBZ == 'N' then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */ +/* Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */ +/* workspace length of SGESVD. */ +/* If WHTSVD == 2 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */ +/* minimal workspace length of SGESDD. */ +/* If WHTSVD == 3 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = N+M+MAX(3*N+1, */ +/* MAX(1,3*N+M,5*N),MAX(1,N)) */ +/* is the minimal workspace length of SGESVDQ. 
*/ +/* If WHTSVD == 4 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */ +/* minimal workspace length of SGEJSV. */ +/* The above expressions are not simplified in order to */ +/* make the usage of WORK more transparent, and for */ +/* easier checking. In any case, LWORK >= 2. */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ 
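Per the WORK/LWORK/IWORK descriptions above, SGEDMD follows the usual LAPACK convention: LWORK = -1 (or LIWORK = -1) performs a pure workspace query, returning the minimal length in WORK(1), the optimal length in WORK(2), and the minimal IWORK length in IWORK(1). A sketch of that query from C (the argument list is taken from the routine itself; the lowercase-underscore mangling is assumed and hidden character-length arguments that some Fortran ABIs append are omitted; assumes 0 <= N <= M):

extern int sgedmd_(char *jobs, char *jobz, char *jobr, char *jobf,
                   int *whtsvd, int *m, int *n, float *x, int *ldx,
                   float *y, int *ldy, int *nrnk, float *tol, int *k,
                   float *reig, float *imeig, float *z, int *ldz,
                   float *res, float *b, int *ldb, float *w, int *ldw,
                   float *s, int *lds, float *work, int *lwork,
                   int *iwork, int *liwork, int *info);

/* Workspace query only: LWORK = LIWORK = -1; the array arguments are
   not referenced, so one-element dummies suffice. */
void sgedmd_query(int m, int n, int whtsvd,
                  int *lwork_min, int *lwork_opt, int *liwork_min) {
    int nrnk = -1, k, lwork = -1, liwork = -1, info;
    int ldm = m > 0 ? m : 1, ldn = n > 0 ? n : 1;
    float tol = 1e-6f, dummy[1], work[2];
    int iwork[1];
    sgedmd_("S", "V", "R", "R", &whtsvd, &m, &n, dummy, &ldm, dummy, &ldm,
            &nrnk, &tol, &k, dummy, dummy, dummy, &ldm, dummy, dummy, &ldm,
            dummy, &ldn, dummy, &ldn, work, &lwork, iwork, &liwork, &info);
    *lwork_min = (int)work[0];   /* WORK(1): minimal LWORK  */
    *lwork_opt = (int)work[1];   /* WORK(2): optimal LWORK  */
    *liwork_min = (int)iwork[0]; /* IWORK(1): minimal LIWORK */
}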
*/ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -18; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -21; + } else if (*ldw < *n) { + *info = -23; + } else if (*lds < *n) { + *info = -25; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. 
*/ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwork = f2cmax(2,*n); + olwork = f2cmax(2,*n); + iminwr = 1; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of SGESVD: */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1, + i__2), i__2 = f2cmin(*m,*n) * 5; + mwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvd; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[ + b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, & + info1); +/* Computing MAX */ + i__1 = mwrsvd, i__2 = (integer) rdummy[0]; + lwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of SGESDD: */ +/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */ +/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */ +/* IMINWR = 8*MIN(M,N) */ +/* Computing MAX */ + i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,* + n) << 2); + mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsdd; + mlwork = f2cmax(i__1,i__2); + iminwr = f2cmin(*m,*n) << 3; + if (lquery) { + sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], & + info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) rdummy[0]; + lwrsdd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsdd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 3) { +/* LWQP3 = 3*N+1 */ +/* LWORQ = MAX(N, 1) */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) */ +/* MLWORK = N + MWRSVQ */ +/* IMINWR = M+N-1 */ + sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, rdummy, &c_n1, rdummy2, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) rdummy[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0]; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) rdummy[0]; +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0]; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; +/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! 
for JSVOPT='V' */ +/* Computing MAX */ + i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (* + n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1) + + *n * *n + 6; + mwrsvj = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvj; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 3, i__2 = *m + *n * 3; + iminwr = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = olwork, i__2 = *n + mwrsvj; + olwork = f2cmax(i__1,i__2); + } + } +/* END SELECT */ + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the SGEEV call */ + if (lsame_(jobzl, "V")) { +/* Computing MAX */ + i__1 = 1, i__2 = *n << 2; + mwrkev = f2cmax(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = 1, i__2 = *n * 3; + mwrkev = f2cmax(i__1,i__2); + } +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrkev; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[ + w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1); +/* Computing MAX */ + i__1 = mwrkev, i__2 = (integer) rdummy[0]; + lwrkev = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrkev; + olwork = f2cmax(i__1,i__2); + } + + if (*liwork < iminwr && ! lquery) { + *info = -29; + } + if (*lwork < mlwork && ! lquery) { + *info = -27; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (real) mlwork; + work[2] = (real) olwork; + return 0; + } +/* ............................................................ */ + + ofl = slamch_("O"); + small = slamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using SLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, X(1,i), 1 ) */ + scale = zero; + slassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("SGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. 
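The column-scaling code above avoids overflow by accumulating each column norm with SLASSQ as a (scale, sumsq) pair satisfying ||x||_2 = scale*sqrt(sumsq), and by storing a norm that would overflow as the negative sentinel -SCALE*(ROOTSC/M). The update SLASSQ performs is, schematically (start from scale = 0, sumsq = 1):

#include <math.h>

/* Schematic of the SLASSQ update: maintain (scale, sumsq) with
   scale^2 * sumsq == sum of squares seen so far, without overflow. */
static void lassq_update(float x, float *scale, float *sumsq) {
    float ax = fabsf(x);
    if (ax == 0.0f) return;
    if (*scale < ax) {
        float r = *scale / ax;
        *sumsq = 1.0f + *sumsq * r * r;   /* rescale the previous sum */
        *scale = ax;
    } else {
        float r = ax / *scale;
        *sumsq += r * r;
    }
}
/* After feeding all entries: norm = scale * sqrtf(sumsq). */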
*/ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (work[i__] > zero) { + r__1 = one / work[i__]; + sscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + r__1 = -work[i__]; + r__2 = one / (real) (*m); + slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * + y_dim1 + 1], m, &info2); +/* LAPACK CA */ + } else if (y[isamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * + y_dim1] != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + sscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using SLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */ + scale = zero; + slassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("SGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (work[i__] > zero) { + r__1 = one / work[i__]; + sscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + r__1 = -work[i__]; + r__2 = one / (real) (*m); + slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * + x_dim1 + 1], m, &info2); +/* LAPACK CA */ + } else if (x[isamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * + x_dim1] != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. 
*/ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + i__1 = *lwork - *n; + sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 2) { + i__1 = *lwork - *n; + sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[ + w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 3) { + i__1 = *lwork - *n - f2cmax(2,*m); + i__2 = f2cmax(2,*m); + sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], & + z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], & + i__2, &info1); + + slacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 4) { + i__1 = *lwork - *n; + sgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1] + , &i__1, &iwork[1], &info1); +/* LAPACK CALL */ + slacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = work[*n + 1]; + xscl2 = work[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case SGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + slascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (work[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= work[1] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^T * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^T is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). 
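A note on the algebra in step <4>, for reference: from the thin SVD X = U * Sigma * V^T and the data relation Y = A * X, restricting to the leading K singular triplets gives

    A * U(:,1:K) = Y * V_k * Sigma_k^(-1),
    S = U(:,1:K)^T * A * U(:,1:K) = ( U(:,1:K)^T * Y ) * V_k * Sigma_k^(-1).

The two branches below evaluate this product in different orders: when refinement data is requested, the M-by-K matrix A*U(:,1:K) is formed first (it is needed in B anyway and again for the residuals); otherwise the K-by-N product U^T*Y is formed first, which avoids any M-by-K intermediate.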
*/ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + r__1 = one / work[i__]; + sscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that SGESVD, SGESVDQ and SGESDD return the */ +/* transposed matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + work[*n + i__] = one / work[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1]; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside SGEDMD). */ + sgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + sgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], + ldz, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + sgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, + &zero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ +/* In the two SGEMM calls here, can use K for LDZ */ +/* B */ + sgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset] + , ldw, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^T is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + slacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + slacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + i__1 = *lwork - *n; + sgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset] + , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. Even in the case of complex spectrum, all */ +/* computation is done in real arithmetic. REIG and */ +/* IMEIG are the real and the imaginary parts of the */ +/* eigenvalues, so that the spectrum is given as */ +/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */ +/* are listed at consecutive positions. 
For such a */ +/* complex conjugate pair of the eigenvalues, the */ +/* corresponding eigenvectors are also a complex */ +/* conjugate pair with the real and imaginary parts */ +/* stored column-wise in W at the corresponding */ +/* consecutive column indices. See the description of Z. */ +/* Also, see the description of SGEEV. */ +/* LAPACK C */ + if (info1 > 0) { +/* SGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + sgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[ + s_offset], lds, &zero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + slacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], + lds, &zero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */ +/* LDS, ZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the real form of the Ritz vectors */ + if (wntvec) { + sgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__ = 1; + while(i__ <= *k) { + if (imeig[i__] == zero) { +/* have a real eigenvalue with real eigenvector */ + r__1 = -reig[i__]; + saxpy_(m, &r__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! */ + + res[i__] = snrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + ++i__; + } else { +/* Have a complex conjugate pair */ +/* REIG(i) +- sqrt(-1)*IMEIG(i). */ +/* Since all computation is done in real */ +/* arithmetic, the formula for the residual */ +/* is recast for real representation of the */ +/* complex conjugate eigenpair. See the */ +/* description of RES. 
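Why the 2-by-2 block in the residual code below works: for a conjugate pair LAMBDA = alpha +- sqrt(-1)*beta with Z(:,i) = Re(z) and Z(:,i+1) = Im(z), one has

    [ Re(z) Im(z) ] * [  alpha  beta  ]  =  [ Re(LAMBDA*z) Im(LAMBDA*z) ],
                      [ -beta   alpha ]

so A*Z(:,i:i+1) - Z(:,i:i+1)*B stacks the real and imaginary parts of A*z - LAMBDA*z column by column, and its Frobenius norm equals the complex 2-norm residual || A*z - LAMBDA*z ||_2 quoted in the description of RES.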
*/ + ab[0] = reig[i__]; + ab[1] = -imeig[i__]; + ab[2] = imeig[i__]; + ab[3] = reig[i__]; + r__1 = -one; + sgemm_("N", "N", m, &c__2, &c__2, &r__1, &z__[i__ * + z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * + y_dim1 + 1], ldy); +/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INT */ +/* BL */ + res[i__] = slange_("F", m, &c__2, &y[i__ * y_dim1 + 1], + ldy, &work[*n + 1]); +/* LA */ + res[i__ + 1] = res[i__]; + i__ += 2; + } + } + } + } + + if (*whtsvd == 4) { + work[*n + 1] = xscl1; + work[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* sgedmd_ */ + diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 new file mode 100644 index 000000000..49cb11527 --- /dev/null +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -0,0 +1,1054 @@ + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!............................................................ +! Purpose +! ======= +! SGEDMD computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, SGEDMD computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, SGEDMD returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +!...................................................................... +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! 
- DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product X(:,1:K)*W, where X +! contains a POD basis (leading left singular vectors +! of the data matrix X) and W contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of K, X, W, Z. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will be +! computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: SGESVD (the QR SVD algorithm) +! 2 :: SGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! 
For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M>= 0 +! The state space dimension (the row dimension of X, Y). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshot pairs +! (the number of columns of X and Y). +!..... +! X (input/output) REAL(KIND=WP) M-by-N array +! > On entry, X contains the data snapshot matrix X. It is +! assumed that the column norms of X are in the range of +! the normalized floating point numbers. +! < On exit, the leading K columns of X contain a POD basis, +! i.e. the leading K left singular vectors of the input +! data matrix X, U(:,1:K). All N columns of X contain all +! left singular vectors of the input matrix X. +! See the descriptions of K, Z and W. +!..... +! LDX (input) INTEGER, LDX >= M +! The leading dimension of the array X. +!..... +! Y (input/workspace/output) REAL(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, and Z. +!..... +! IMEIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of IMEIG contain +! the imaginary parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! 
The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i) > 0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i)
+!     That is, complex conjugate pairs have consecutive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, and Z.
+!.....
+!     Z (workspace/output) REAL(KIND=WP) M-by-N array
+!     If JOBZ == 'V' then
+!        Z contains real Ritz vectors as follows:
+!        If IMEIG(i) == 0, then Z(:,i) is an eigenvector of
+!        the i-th Ritz value; ||Z(:,i)||_2 = 1.
+!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!        the Ritz values extracted from this subspace are
+!        REIG(i) + sqrt(-1)*IMEIG(i) and
+!        REIG(i) - sqrt(-1)*IMEIG(i).
+!        The corresponding eigenvectors are
+!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!        ||Z(:,i:i+1)||_F = 1.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!     of W(1:K,1:K) are the computed eigenvectors of the
+!     K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!     are similarly structured: If IMEIG(i) == 0 then
+!     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i) > 0
+!     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and W.
+!.....
+!     LDZ (input) INTEGER, LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) N-by-1 array
+!     RES(1:K) contains the residuals for the K computed
+!     Ritz pairs.
+!     If LAMBDA(i) is real, then
+!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!        RES(i) = RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) * B ||_F
+!     where B = [ real(LAMBDA(i))  imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i))  real(LAMBDA(i)) ].
+!     It holds that
+!        RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!        RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   = Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the descriptions of REIG, IMEIG and Z.
+!.....
+!     B (output) REAL(KIND=WP) M-by-N array.
+!     If JOBF == 'R', B(1:M,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further
+!     details in the provided references.
+!     If JOBF == 'E', B(1:M,1:K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.
+!     If JOBF == 'N', then B is not referenced.
+!     See the descriptions of X, W, K.
+!.....
+!     LDB (input) INTEGER, LDB >= M
+!     The leading dimension of the array B.
+!.....
+!     W (workspace/output) REAL(KIND=WP) N-by-N array
+!     On exit, W(1:K,1:K) contains the K computed
+!     eigenvectors of the matrix Rayleigh quotient (real and
+!     imaginary parts for each complex conjugate pair of the
+!     eigenvalues). The Ritz vectors (returned in Z) are the
+!     product of X (containing a POD basis for the input
+!     matrix X) and W. See the descriptions of K, S, X and Z.
+!     W is also used as a workspace to temporarily store the
+!     right singular vectors of X.
+!.....
+!     LDW (input) INTEGER, LDW >= N
+!     The leading dimension of the array W.
+!.....
+!     S (workspace/output) REAL(KIND=WP) N-by-N array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. 
This content is overwritten during +! the eigenvalue decomposition by SGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, WORK(1:N) contains the singular values of +! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +! scaling factor WORK(N+2)/WORK(N+1) used to scale X +! and Y to avoid overflow in the SVD of X. +! This may be of interest if the scaling option is off +! and as many as possible smallest eigenvalues are +! desired to the highest feasible accuracy. +! If the call to SGEDMD is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! If WHTSVD == 1 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +! If JOBZ == 'N' then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +! workspace length of SGESVD. +! If WHTSVD == 2 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +! minimal workspace length of SGESDD. +! If WHTSVD == 3 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +! Here LWORK_SVD = N+M+MAX(3*N+1, +! MAX(1,3*N+M,5*N),MAX(1,N)) +! is the minimal workspace length of SGESVDQ. +! If WHTSVD == 4 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +! minimal workspace length of SGEJSV. +! The above expressions are not simplified in order to +! make the usage of WORK more transparent, and for +! easier checking. In any case, LWORK >= 2. +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! 
= 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT + +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) + +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 + EXTERNAL SLANGE, SLAMCH, SNRM2, ISAMAX + INTEGER ISAMAX + LOGICAL SISNAN, LSAME + EXTERNAL SISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL SAXPY, SGEMM, SSCAL + EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & + SLACPY, SLASCL, SLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC INT, FLOAT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -18 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -21 + ELSE IF ( LDW < N ) THEN + INFO = -23 + ELSE IF ( LDS < N ) THEN + INFO = -25 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWORK = MAX(2,N) + OLWORK = MAX(2,N) + IMINWR = 1 + SELECT CASE ( WHTSVD ) + CASE (1) + ! 
The following is specified as the minimal + ! length of WORK in the definition of SGESVD: + ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MLWORK = MAX(MLWORK,N + MWRSVD) + IF ( LQUERY ) THEN + CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, & + B, LDB, W, LDW, RDUMMY, -1, INFO1 ) + LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of SGESDD: + ! MWRSDD = 3*MIN(M,N)*MIN(M,N) + + ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + ! IMINWR = 8*MIN(M,N) + MWRSDD = 3*MIN(M,N)*MIN(M,N) + & + MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + MLWORK = MAX(MLWORK,N + MWRSDD) + IMINWR = 8*MIN(M,N) + IF ( LQUERY ) THEN + CALL SGESDD( 'O', M, N, X, LDX, WORK, B, & + LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSDD) + END IF + CASE (3) + !LWQP3 = 3*N+1 + !LWORQ = MAX(N, 1) + !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) + !MLWORK = N + MWRSVQ + !IMINWR = M+N-1 + CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, -1, RDUMMY, & + -1, RDUMMY2, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(RDUMMY(2)) + MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(RDUMMY(1)) + OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1))) + END IF + CASE (4) + JSVOPT = 'J' + !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V' + MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 ) + MLWORK = MAX(MLWORK,N+MWRSVJ) + IMINWR = MAX( 3, M+3*N ) + IF ( LQUERY ) THEN + OLWORK = MAX(OLWORK,N+MWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the SGEEV call + IF ( LSAME(JOBZL,'V') ) THEN + MWRKEV = MAX( 1, 4*N ) + ELSE + MWRKEV = MAX( 1, 3*N ) + END IF + MLWORK = MAX(MLWORK,N+MWRKEV) + IF ( LQUERY ) THEN + CALL SGEEV( 'N', JOBZL, N, S, LDS, REIG, & + IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 ) + LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) ) + OLWORK = MAX( OLWORK, N+LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29 + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27 + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'SGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = SLAMCH('O') + SMALL = SLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using SLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! 
Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + X(1,i), M, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( WORK(i) > ZERO ) THEN + CALL SSCAL( M, ONE/WORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL SLASCL( 'G', 0, 0, -WORK(i), & + ONE/FLOAT(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( Y(ISAMAX(M, Y(1,i),1),i ) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL SSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using SLASSQ. + DO i = 1, N + !WORK(i) = DNRM2( M, Y(1,i), 1 ) + SCALE = ZERO + CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('SGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + Y(1,i), M, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( WORK(i) > ZERO ) THEN + CALL SSCAL( M, ONE/WORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL SLASCL( 'G', 0, 0, -WORK(i), & + ONE/FLOAT(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( X(ISAMAX(M, X(1,i),1),i ) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! 
The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, B, & + LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (2) + CALL SGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, & + LDW, WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (3) + CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),& + LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1) ! LAPACK CALL + CALL SLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'T' + CASE (4) + CALL SGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, WORK, Z, LDZ, W, LDW, & + WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL SLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = WORK(N+1) + XSCL2 = WORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case SGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL SLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( WORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( WORK(i+1) <= WORK(i)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( WORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^T * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^T is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL SSCAL( N, ONE/WORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that SGESVD, SGESVDQ and SGESDD return the + ! transposed matrix of the right singular vectors. + !DO i = 1, K + ! CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + WORK(N+i) = ONE/WORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = (WORK(N+i))*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside SGEDMD). 
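+         ! (An editorial note on the shapes in the call below:
+         ! with T_OR_N == 'T', W(1:K,1:N) holds the scaled V_k^T
+         ! and SGEMM forms the M-by-K product Z = Y * W^T;
+         ! with T_OR_N == 'N', W(1:N,1:K) holds the scaled V_k
+         ! and the same call forms Z = Y * W.)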
+ CALL SGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, & + LDW, ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL SGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, & + LDZ, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^T * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL SGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC + ! In the two SGEMM calls here, can use K for LDZ + CALL SGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, & + LDW, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^T * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^T is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL SLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL SLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL SGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, & + LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. Even in the case of complex spectrum, all + ! computation is done in real arithmetic. REIG and + ! IMEIG are the real and the imaginary parts of the + ! eigenvalues, so that the spectrum is given as + ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs + ! are listed at consecutive positions. For such a + ! complex conjugate pair of the eigenvalues, the + ! corresponding eigenvectors are also a complex + ! conjugate pair with the real and imaginary parts + ! stored column-wise in W at the corresponding + ! consecutive column indices. See the description of Z. + ! Also, see the description of SGEEV. + IF ( INFO1 > 0 ) THEN + ! SGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL SGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, & + LDW, ZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! 
= A * U(:,1:K) * W(1:K,1:K) + CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, Z, LDZ ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL SLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + ! LDS, ZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the real form of the Ritz vectors + IF ( WNTVEC ) CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + i = 1 + DO WHILE ( i <= K ) + IF ( IMEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! INTRINSIC + RES(i) = SNRM2( M, Y(1,i), 1 ) ! BLAS CALL + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IMEIG(i) + AB(1,2) = IMEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + LDZ, AB, 2, ONE, Y(1,i), LDY ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES(i) = SLANGE( 'F', M, 2, Y(1,i), LDY, & + WORK(N+1) ) ! LAPACK CALL + RES(i+1) = RES(i) + i = i + 2 + END IF + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + WORK(N+1) = XSCL1 + WORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... 
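+!
+!     A minimal calling sketch (editorial; the problem setup and
+!     array names are hypothetical and not part of this routine):
+!
+!        ! Workspace query first (LWORK = -1, LIWORK = -1):
+!        CALL SGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY,  &
+!                     -1, TOL, K, REIG, IMEIG, Z, LDZ, RES, B, LDB, &
+!                     W, LDW, S, LDS, WORK, -1, IWORK, -1, INFO )
+!        LWORK = INT(WORK(2)); LIWORK = IWORK(1)
+!        ! ... allocate WORK(LWORK) and IWORK(LIWORK), then compute:
+!        CALL SGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY,  &
+!                     -1, TOL, K, REIG, IMEIG, Z, LDZ, RES, B, LDB, &
+!                     W, LDW, S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )
+!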
+ END SUBROUTINE SGEDMD + diff --git a/lapack-netlib/SRC/sgedmdq.c b/lapack-netlib/SRC/sgedmdq.c new file mode 100644 index 000000000..0adf3bda3 --- /dev/null +++ b/lapack-netlib/SRC/sgedmdq.c @@ -0,0 +1,1296 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + 
Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F) */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by SGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in WORK(1:N). */ +/* See the description of WORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K)of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X */ +/* ..... */ +/* Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... 
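+   (A schematic sketch of this layout, as an editorial note: with
+   the initial QR factorization F = Q*R, the routine works with the
+   compressed snapshots X = R(:,1:N-1) and Y = R(:,2:N), so the
+   DMD computation itself runs on MIN(M,N)-by-(N-1) data; results
+   are lifted back to the original dimension by pre-multiplying
+   with the Q factor.)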
*/
+/* LDY (input) INTEGER, LDY >= N */
+/* The leading dimension of the array Y */
+/* ..... */
+/* NRNK (input) INTEGER */
+/* Determines the mode how to compute the numerical rank, */
+/* i.e. how to truncate small singular values of the input */
+/* matrix X. On input, if */
+/* NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/* if sigma(i) <= TOL*sigma(1) */
+/* This option is recommended. */
+/* NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/* if sigma(i) <= TOL*sigma(i-1) */
+/* This option is included for R&D purposes. */
+/* It requires highly accurate SVD, which */
+/* may not be feasible. */
+/* The numerical rank can be enforced by using a positive */
+/* value of NRNK as follows: */
+/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/* will be used. If the number of the computed nonzero */
+/* singular values is less than NRNK, then only those */
+/* nonzero values will be used and the actually used */
+/* dimension is less than NRNK. The actual number of */
+/* the nonzero singular values is returned in the variable */
+/* K. See the description of K. */
+/* ..... */
+/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/* The tolerance for truncating small singular values. */
+/* See the description of NRNK. */
+/* ..... */
+/* K (output) INTEGER, 0 <= K <= N */
+/* The dimension of the SVD/POD basis for the leading N-1 */
+/* data snapshots (columns of F) and the number of the */
+/* computed Ritz pairs. The value of K is determined */
+/* according to the rule set by the parameters NRNK and */
+/* TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/* REIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/* The leading K (K<=N) entries of REIG contain */
+/* the real parts of the computed eigenvalues */
+/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/* See the descriptions of K, IMEIG, Z. */
+/* ..... */
+/* IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/* The leading K (K<=N) entries of IMEIG contain */
+/* the imaginary parts of the computed eigenvalues */
+/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/* The eigenvalues are determined as follows: */
+/* If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/* real, LAMBDA(i) = REIG(i). */
+/* If IMEIG(i) > 0, then the corresponding complex */
+/* conjugate pair of eigenvalues reads */
+/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMEIG(i) */
+/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i) */
+/* That is, complex conjugate pairs have consecutive */
+/* indices (i,i+1), with the positive imaginary part */
+/* listed first. */
+/* See the descriptions of K, REIG, Z. */
+/* ..... */
+/* Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array */
+/* If JOBZ == 'V' then */
+/* Z contains real Ritz vectors as follows: */
+/* If IMEIG(i) == 0, then Z(:,i) is an eigenvector of */
+/* the i-th Ritz value. */
+/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/* the Ritz values extracted from this subspace are */
+/* REIG(i) + sqrt(-1)*IMEIG(i) and */
+/* REIG(i) - sqrt(-1)*IMEIG(i). */
+/* The corresponding eigenvectors are */
+/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/* If JOBZ == 'F', then the above descriptions hold for */
+/* the columns of Z*V, where the columns of V are the */
+/* eigenvectors of the K-by-K Rayleigh quotient, and Z is */
+/* orthonormal. The columns of V are similarly structured: */
+/* If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */
+/* IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */
+/* Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */
+/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/* See the descriptions of REIG, IMEIG, X and V. */
+/* ..... */
+/* LDZ (input) INTEGER, LDZ >= M */
+/* The leading dimension of the array Z. */
+/* ..... 
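+   (A small worked reading of the eigenvalue layout above, as an
+   editorial note: if, say, K = 3 with IMEIG(1) = 0, IMEIG(2) = b > 0
+   and IMEIG(3) = -b, then LAMBDA(1) = REIG(1) is a real Ritz value
+   with the real Ritz vector Z(:,1), while Z(:,2) + sqrt(-1)*Z(:,3)
+   and Z(:,2) - sqrt(-1)*Z(:,3) are the Ritz vectors of the
+   conjugate pair REIG(2) +- sqrt(-1)*b.)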
*/ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) contains the K eigenvectors of */ +/* the Rayleigh quotient. The eigenvectors of a complex */ +/* conjugate pair of eigenvalues are returned in real form */ +/* as explained in the description of Z. The Ritz vectors */ +/* (returned in Z) are the product of X and V; see */ +/* the descriptions of X and Z. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by SGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by SGEQRF of the */ +/* M-by-N input matrix F. */ +/* WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to SGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. 
+/* LWORK is calculated as follows: */
+/* Let MLWQR  = N (minimal workspace for SGEQRF[M,N]) */
+/*     MLWDMD = minimal workspace for SGEDMD (see the */
+/*              description of LWORK in SGEDMD) for */
+/*              snapshots of dimensions MIN(M,N)-by-(N-1) */
+/*     MLWMQR = N (minimal workspace for */
+/*              SORMQR['L','N',M,N,N]) */
+/*     MLWGQR = N (minimal workspace for SORGQR[M,N,N]) */
+/* Then */
+/* LWORK = MAX(N+MLWQR, N+MLWDMD) */
+/* is updated as follows: */
+/* if JOBZ == 'V' or JOBZ == 'F' THEN */
+/*    LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) */
+/* if JOBQ == 'Q' THEN */
+/*    LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR ) */
+/* If on entry LWORK = -1, then a workspace query is */
+/* assumed and the procedure only computes the minimal */
+/* and the optimal workspace lengths for both WORK and */
+/* IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/* Workspace that is required only if WHTSVD equals */
+/* 2, 3 or 4. (See the description of WHTSVD). */
+/* If on entry LWORK = -1 or LIWORK = -1, then the */
+/* minimal length of IWORK is computed and returned in */
+/* IWORK(1). See the description of LIWORK. */
+/* ..... */
+/* LIWORK (input) INTEGER */
+/* The minimal length of the workspace vector IWORK. */
+/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >= 1 */
+/* Let M1=MIN(M,N), N1=N-1. Then */
+/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */
+/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */
+/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */
+/* If on entry LIWORK = -1, then a workspace query is */
+/* assumed and the procedure only computes the minimal */
+/* and the optimal workspace lengths for both WORK and */
+/* IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/* INFO (output) INTEGER */
+/* -i < 0 :: On entry, the i-th argument had an */
+/*           illegal value */
+/* = 0 :: Successful return. */
+/* = 1 :: Void input. Quick exit (M=0 or N=0). */
+/* = 2 :: The SVD computation of X did not converge. */
+/*        Suggestion: Check the input data and/or */
+/*        repeat with different WHTSVD. */
+/* = 3 :: The computation of the eigenvalues did not */
+/*        converge. */
+/* = 4 :: If data scaling was requested on input and */
+/*        the procedure found inconsistency in the data */
+/*        such that for some column index i, */
+/*        X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*        to zero if JOBS=='C'. The computation proceeds */
+/*        with original or modified data and a warning */
+/*        flag is set with INFO=4. */
+/* ............................................................. */
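+/* A hedged usage sketch (illustrative only, not part of this */
+/* routine): the two-call workspace-query convention described */
+/* above, written in Fortran; WQ(2) and IWQ(1) are hypothetical */
+/* caller-side query buffers, all other arguments as documented: */
+/*    CALL SGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, WHTSVD, & */
+/*                  M, N, F, LDF, X, LDX, Y, LDY, NRNK, TOL, K,  & */
+/*                  REIG, IMEIG, Z, LDZ, RES, B, LDB, V, LDV,    & */
+/*                  S, LDS, WQ, -1, IWQ, -1, INFO ) */
+/*    LWORK  = INT( WQ(2) )  ! optimal length; WQ(1) is the minimum */
+/*    LIWORK = IWQ(1) */
+/*    ALLOCATE( WORK(LWORK), IWORK(LIWORK) ) */
+/*    CALL SGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, WHTSVD, & */
+/*                  M, N, F, LDF, X, LDX, Y, LDY, NRNK, TOL, K,  & */
+/*                  REIG, IMEIG, Z, LDZ, RES, B, LDB, V, LDV,    & */
+/*                  S, LDS, WORK, LWORK, IWORK, LIWORK, INFO ) */
+/* .............................................................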
*/ +/* Parameters */ +/* ~~~~~~~~~~ */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local array */ +/* ~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; +/* .......................................................... */ + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -22; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -25; + } else if (*ldv < *n - 1) { + *info = -27; + } else if (*lds < *n - 1) { + *info = -29; + } + + if (wntvec || wntvcf) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for SGEQRF. 
*/
+    mlwork = f2cmin(*m,*n) + mlwqr;
+    if (lquery) {
+        sgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1);
+        olwqr = (integer) rdummy[0];
+        olwork = f2cmin(*m,*n) + olwqr;
+    }
+    i__1 = *n - 1;
+    sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset],
+            ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &
+            z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset],
+            ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, &
+            info1);
+    mlwdmd = (integer) work[1];
+/* Computing MAX */
+    i__1 = mlwork, i__2 = minmn + mlwdmd;
+    mlwork = f2cmax(i__1,i__2);
+    iminwr = iwork[1];
+    if (lquery) {
+        olwdmd = (integer) work[2];
+/* Computing MAX */
+        i__1 = olwork, i__2 = minmn + olwdmd;
+        olwork = f2cmax(i__1,i__2);
+    }
+    if (wntvec || wntvcf) {
+        mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+        i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr;
+        mlwork = f2cmax(i__1,i__2);
+        if (lquery) {
+            sormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], &
+                    z__[z_offset], ldz, &work[1], &c_n1, &info1);
+            olwmqr = (integer) work[1];
+/* Computing MAX */
+            i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr;
+            olwork = f2cmax(i__1,i__2);
+        }
+    }
+    if (wantq) {
+        mlwgqr = *n;
+/* Computing MAX */
+        i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr;
+        mlwork = f2cmax(i__1,i__2);
+        if (lquery) {
+            sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[
+                    1], &c_n1, &info1);
+            olwgqr = (integer) work[1];
+/* Computing MAX */
+            i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr;
+            olwork = f2cmax(i__1,i__2);
+        }
+    }
+    iminwr = f2cmax(1,iminwr);
+    mlwork = f2cmax(2,mlwork);
+    if (*lwork < mlwork && ! lquery) {
+        *info = -31;
+    }
+    if (*liwork < iminwr && ! lquery) {
+        *info = -33;
+    }
+    }
+    if (*info != 0) {
+        i__1 = -(*info);
+        xerbla_("SGEDMDQ", &i__1);
+        return 0;
+    } else if (lquery) {
+/* Return minimal and optimal workspace sizes */
+        iwork[1] = iminwr;
+        work[1] = (real) mlwork;
+        work[2] = (real) olwork;
+        return 0;
+    }
+/* ..... */
+/* Initial QR factorization that is used to represent the */
+/* snapshots as elements of a lower dimensional subspace. */
+/* For large scale computations with M >> N, an out-of-core */
+/* QRF can be used at this point. */
+
+    i__1 = *lwork - minmn;
+    sgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, &
+            info1);
+
+/* Define X and Y as the snapshot representations in the */
+/* orthogonal basis computed in the QR factorization. */
+/* X corresponds to the leading N-1 and Y to the trailing */
+/* N-1 snapshots. */
+    i__1 = *n - 1;
+    slaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    slacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    slacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+        i__1 = minmn - 2;
+        i__2 = *n - 2;
+        slaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy);
+    }
+
+/* Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lwork - minmn;
+    sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+            &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[
+            z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+            s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, &
+            info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. */
+        *info = info1;
+        return 0;
+    } else {
+        *info = info1;
+    }
+
+/* The Ritz vectors (Koopman modes) can be explicitly */
+/* formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+        if (*m > minmn) {
+            i__1 = *m - minmn;
+            slaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1],
+                    ldz);
+        }
+        i__1 = *lwork - (minmn + *n - 1);
+        sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+                z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    } else if (wntvcf) {
+/* Return the Ritz vectors (eigenvectors) in factored */
+/* form Z*V, where Z contains an orthonormal matrix (the */
+/* product of Q from the initial QR factorization and */
+/* the SVD/POD_basis returned by SGEDMD in X) and the */
+/* second factor (the eigenvectors of the Rayleigh */
+/* quotient) is in the array V, as returned by SGEDMD. */
+        slacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+        if (*m > *n) {
+            i__1 = *m - *n;
+            slaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz);
+        }
+        i__1 = *lwork - (minmn + *n - 1);
+        sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+                z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    }
+
+/* Some optional output variables: */
+
+/* The upper triangular factor in the initial QR */
+/* factorization is optionally returned in the array Y. */
+/* This is useful if this call to SGEDMDQ is to be */
+/* followed by a streaming DMD that is implemented in a */
+/* QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+        slaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy);
+        slacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/* The orthonormal/orthogonal factor in the initial QR */
+/* factorization is optionally returned in the array F. */
+/* Same as with the triangular factor above, this is */
+/* useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+        i__1 = *lwork - (minmn + *n - 1);
+        sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn
+                + *n], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* sgedmdq_ */
+
diff --git a/lapack-netlib/SRC/sgedmdq.f90 b/lapack-netlib/SRC/sgedmdq.f90
new file mode 100644
index 000000000..acd5d56c6
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmdq.f90
@@ -0,0 +1,703 @@
+SUBROUTINE SGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF,  &
+                    WHTSVD, M, N, F, LDF, X, LDX, Y,     &
+                    LDY, NRNK, TOL, K, REIG, IMEIG,      &
+                    Z, LDZ, RES, B, LDB, V, LDV,         &
+                    S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32
+!.....
+!     Scalar arguments
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,   &
+                                JOBT, JOBF
+      INTEGER, INTENT(IN)    :: WHTSVD, M, N, LDF, LDX,   &
+                                LDY, NRNK, LDZ, LDB, LDV, &
+                                LDS, LWORK, LIWORK
+      INTEGER, INTENT(OUT)   :: INFO, K
+      REAL(KIND=WP), INTENT(IN) :: TOL
+!     Array arguments
+      REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      REAL(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*), &
+                                      Z(LDZ,*), B(LDB,*), &
+                                      V(LDV,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*), IMEIG(*),  &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+      INTEGER, INTENT(OUT)         :: IWORK(*)
+!.....
+!     Purpose
+!     =======
+!     SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an inaccessible matrix
+!     A, SGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular
+!     vectors of X. Optionally, SGEDMDQ returns the residuals
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391-421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb; drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations
+!     expressed in this material are those of the author and
+!     do not necessarily reflect the views of the DARPA SBIR
+!     Program Office.
+!============================================================
+!     Distribution Statement A:
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================
+!......................................................................
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y.
+!     'S' :: The data snapshot matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is a zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshot matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'N' :: No data scaling.
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretized operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z.
+!            Q is from the initial QR factorization.
+!     'N' :: The eigenvectors are not computed.
+!.....
+!     JOBR (input) CHARACTER*1
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1
+!     Specifies whether to explicitly compute and return the
+!     orthogonal matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.
+!     'N' :: The matrix R is not returned.
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!     To be useful on exit, this option needs JOBQ='Q'.
+!.....
+!     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: SGESVD (the QR SVD algorithm)
+!     2 :: SGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: SGESVDQ (the preconditioned QR SVD; this and 4
+!          are the most accurate options)
+!     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N', then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0
+!     The state space dimension (the number of rows of F)
+!.....
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the
+!     number of columns of F.
+!.....
+!     F (input/output) REAL(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are
+!     in the range of the normalized floating point numbers.
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal
+!     matrix/factor of the QR factorization of the initial
+!     data snapshots matrix F. See the description of JOBQ.
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the
+!     Householder vectors, as returned by SGEQRF. The
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in WORK(1:N).
+!     See the description of WORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K) of the input data, pre-multiply with the
+!     Q factor from the initial QR factorization.
+!     See the descriptions of F, K, V and Z.
+!.....
+!     LDX (input) INTEGER, LDX >= N
+!     The leading dimension of the array X
+!.....
+!     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit,
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode for computing the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.
+!     The numerical rank can be enforced by using a positive
+!     value of NRNK as follows:
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.
+!.....
+!     K (output) INTEGER, 0 <= K <= N
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and
+!     TOL. See the descriptions of NRNK and TOL.
+!.....
+!     REIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N) entries of REIG contain
+!     the real parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     See the descriptions of K, IMEIG, Z.
+!.....
+!     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N) entries of IMEIG contain
+!     the imaginary parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i) > 0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i)
+!     That is, complex conjugate pairs have consecutive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, Z.
+!.....
+!     Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array
+!     If JOBZ =='V' then
+!     Z contains real Ritz vectors as follows:
+!     If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!     the i-th Ritz value.
+!     If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!     [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!     the Ritz values extracted from this subspace are
+!     REIG(i) + sqrt(-1)*IMEIG(i) and
+!     REIG(i) - sqrt(-1)*IMEIG(i).
+!     The corresponding eigenvectors are
+!     Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!     Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of Z*V, where the columns of V are the
+!     eigenvectors of the K-by-K Rayleigh quotient, and Z is
+!     orthonormal. The columns of V are similarly structured:
+!     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if
+!     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and
+!                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and V.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed
+!     Ritz pairs.
+!     If LAMBDA(i) is real, then
+!     RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!     It holds that
+!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   = Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the description of Z.
+!.....
+!     B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array.
+!     If JOBF == 'R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further
+!     details in the provided references.
+!     If JOBF == 'E', B(1:N,1:K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.
+!     In both cases, the content of B can be lifted to the
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization.
+!     Here A denotes a compression of the underlying operator.
+!     See the descriptions of F and X.
+!     If JOBF == 'N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....
+!     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) contains the K eigenvectors of
+!     the Rayleigh quotient. The eigenvectors of a complex
+!     conjugate pair of eigenvalues are returned in real form
+!     as explained in the description of Z. The Ritz vectors
+!     (returned in Z) are the product of X and V; see
+!     the descriptions of X and Z.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....
+!     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by SGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1
+!     The leading dimension of the array S.
+!.....
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit,
+!     WORK(1:MIN(M,N)) contains the scalar factors of the
+!     elementary reflectors as returned by SGEQRF of the
+!     M-by-N input matrix F.
+!     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to SGEDMDQ is only a workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of WORK is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the workspace vector WORK.
+!     LWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for SGEQRF[M,N])
+!         MLWDMD = minimal workspace for SGEDMD (see the
+!                  description of LWORK in SGEDMD) for
+!                  snapshots of dimensions MIN(M,N)-by-(N-1)
+!         MLWMQR = N (minimal workspace for
+!                  SORMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for SORGQR[M,N,N])
+!     Then
+!     LWORK = MAX(N+MLWQR, N+MLWDMD)
+!     is updated as follows:
+!     if JOBZ == 'V' or JOBZ == 'F' THEN
+!        LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR )
+!     if JOBQ == 'Q' THEN
+!        LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR )
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2, 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK = -1 or LIWORK = -1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >= 1
+!     Let M1=MIN(M,N), N1=N-1. Then
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!.....
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!     = 0 :: Successful return.
+!     = 1 :: Void input. Quick exit (M=0 or N=0).
+!     = 2 :: The SVD computation of X did not converge.
+!            Suggestion: Check the input data and/or
+!            repeat with different WHTSVD.
+!     = 3 :: The computation of the eigenvalues did not
+!            converge.
+!     = 4 :: If data scaling was requested on input and
+!            the procedure found inconsistency in the data
+!            such that for some column index i,
+!            X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!            to zero if JOBS=='C'. The computation proceeds
+!            with original or modified data and a warning
+!            flag is set with INFO=4.
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1, MLWDMD, MLWGQR,  &
+                           MLWMQR, MLWORK, MLWQR, MINMN,   &
+                           OLWDMD, OLWGQR, OLWMQR, OLWORK, &
+                           OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!
+!     Local array
+!     ~~~~~~~~~~~
+      REAL(KIND=WP) :: RDUMMY(2)
+!
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SGEMM
+      EXTERNAL      SGEQRF, SLACPY, SLASET, SORGQR, &
+                    SORMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SGEDMD
+
+!
Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -22 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -25 + ELSE IF ( LDV < N-1 ) THEN + INFO = -27 + ELSE IF ( LDS < N-1 ) THEN + INFO = -29 + END IF +! + IF ( WNTVEC .OR. WNTVCF ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWQR = MAX(1,N) ! Minimal workspace length for SGEQRF. + MLWORK = MIN(M,N) + MLWQR + IF ( LQUERY ) THEN + CALL SGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, & + INFO1 ) + OLWQR = INT(RDUMMY(1)) + OLWORK = MIN(M,N) + OLWQR + END IF + CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, & + V, LDV, S, LDS, WORK, -1, IWORK, & + LIWORK, INFO1 ) + MLWDMD = INT(WORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + IMINWR = IWORK(1) + IF ( LQUERY ) THEN + OLWDMD = INT(WORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR) + IF ( LQUERY ) THEN + CALL SORMQR( 'L','N', M, N, MINMN, F, LDF, & + WORK, Z, LDZ, WORK, -1, INFO1 ) + OLWMQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = N + MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR) + IF ( LQUERY ) THEN + CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK, -1, INFO1 ) + OLWGQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR) + END IF + END IF + IMINWR = MAX( 1, IMINWR ) + MLWORK = MAX( 2, MLWORK ) + IF ( LWORK < MLWORK .AND. 
(.NOT.LQUERY) )   INFO = -31
+      IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33
+   END IF
+   IF( INFO /= 0 ) THEN
+      CALL XERBLA( 'SGEDMDQ', -INFO )
+      RETURN
+   ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+      IWORK(1) = IMINWR
+      WORK(1)  = MLWORK
+      WORK(2)  = OLWORK
+      RETURN
+   END IF
+!.....
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of a lower dimensional subspace.
+!     For large scale computations with M >> N, an out-of-core
+!     QRF can be used at this point.
+!
+      CALL SGEQRF( M, N, F, LDF, WORK, &
+           WORK(MINMN+1), LWORK-MINMN, INFO1 )
+!
+!     Define X and Y as the snapshot representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL SLASET( 'L', MINMN, N-1, ZERO,  ZERO, X, LDX )
+      CALL SLACPY( 'U', MINMN, N-1, F,     LDF,  X, LDX )
+      CALL SLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL SLASET( 'L', MINMN-2, N-2, ZERO, ZERO, &
+                       Y(3,1), LDY )
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)
+      CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, &
+                   N-1, X, LDX, Y, LDY, NRNK,  TOL, K,     &
+                   REIG, IMEIG, Z, LDZ, RES, B, LDB, V,    &
+                   LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, IWORK, &
+                   LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF
+!
+!     The Ritz vectors (Koopman modes) can be explicitly
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.
+        IF ( M > MINMN ) CALL SLASET( 'A', M-MINMN, K, ZERO, &
+                                      ZERO, Z(MINMN+1,1), LDZ )
+        CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, &
+                     LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      ELSE IF ( WNTVCF ) THEN
+        ! Return the Ritz vectors (eigenvectors) in factored
+        ! form Z*V, where Z contains an orthonormal matrix (the
+        ! product of Q from the initial QR factorization and
+        ! the SVD/POD_basis returned by SGEDMD in X) and the
+        ! second factor (the eigenvectors of the Rayleigh
+        ! quotient) is in the array V, as returned by SGEDMD.
+        CALL SLACPY( 'A', N, K, X, LDX, Z, LDZ )
+        IF ( M > N ) CALL SLASET( 'A', M-N, K, ZERO, ZERO, &
+                                  Z(N+1,1), LDZ )
+        CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, &
+                     LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      END IF
+!
+!     Some optional output variables:
+!
+!     The upper triangular factor in the initial QR
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to SGEDMDQ is to be
+!     followed by a streaming DMD that is implemented in a
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y
+          CALL SLASET( 'A', MINMN, N, ZERO, ZERO, Y, LDY )
+          CALL SLACPY( 'U', MINMN, N, F, LDF, Y, LDY )
+      END IF
+!
+!     The orthonormal/orthogonal factor in the initial QR
+!     factorization is optionally returned in the array F.
+!     Same as with the triangular factor above, this is
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN ! Q overwrites F
+          CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+                       WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      END IF
+!
+      RETURN
+!
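+!     A hedged illustration (not part of this routine): with
+!     JOBZ == 'F' the Ritz vectors are returned in factored form,
+!     and the caller can assemble them as Z*V with SGEMM; RITZ
+!     below is a hypothetical caller-allocated M-by-K array:
+!
+!        CALL SGEMM( 'N', 'N', M, K, K, 1.0_WP, Z, LDZ, &
+!                    V, LDV, 0.0_WP, RITZ, M )
+!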
+ END SUBROUTINE SGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f index 923573bdb..42d2b8313 100644 --- a/lapack-netlib/SRC/sgejsv.f +++ b/lapack-netlib/SRC/sgejsv.f @@ -253,7 +253,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then *> copied back to the U array. This 'W' option is just @@ -1386,7 +1386,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL SLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1409,7 +1409,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1454,7 +1454,7 @@ * :) .. the input matrix A is very likely a relative of * the Kahan matrix :) * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* is Q3^T*V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL SGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f index 23816b4c8..82f5c1cf4 100644 --- a/lapack-netlib/SRC/sgelqt3.f +++ b/lapack-netlib/SRC/sgelqt3.f @@ -158,7 +158,8 @@ * * Compute Householder transform when M=1 * - CALL SLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL SLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) * ELSE * diff --git a/lapack-netlib/SRC/sgelsd.f b/lapack-netlib/SRC/sgelsd.f index f5f17d34c..9fda7b593 100644 --- a/lapack-netlib/SRC/sgelsd.f +++ b/lapack-netlib/SRC/sgelsd.f @@ -59,12 +59,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index 9aed4329f..89d3a6e4f 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -202,7 +202,7 @@ * .. * .. External Subroutines .. EXTERNAL SBDSQR, SCOPY, SGEBRD, SGELQF, SGEMM, SGEMV, - $ SGEQRF, SLABAD, SLACPY, SLASCL, SLASET, SORGBR, + $ SGEQRF, SLACPY, SLASCL, SLASET, SORGBR, $ SORMBR, SORMLQ, SORMQR, SRSCL, XERBLA * .. * .. 
External Functions .. @@ -381,7 +381,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -525,7 +524,7 @@ $ LDB, ZERO, WORK, N ) CALL SLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL SCOPY( N, WORK, 1, B, 1 ) END IF @@ -622,7 +621,7 @@ CALL SLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, ZERO, WORK( IWORK ), 1 ) CALL SCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -708,7 +707,7 @@ $ LDB, ZERO, WORK, N ) CALL SLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL SCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/sgelsy.f b/lapack-netlib/SRC/sgelsy.f index 9c60f78a7..89dd39e80 100644 --- a/lapack-netlib/SRC/sgelsy.f +++ b/lapack-netlib/SRC/sgelsy.f @@ -115,6 +115,7 @@ *> B is REAL array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -147,6 +148,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index d3b5e3ba1..d271bb757 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -55,12 +55,6 @@ *> *> Note that the routine returns VT = V**T, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sgesvdq.f b/lapack-netlib/SRC/sgesvdq.f index ebbfc70b1..6fb328b8c 100644 --- a/lapack-netlib/SRC/sgesvdq.f +++ b/lapack-netlib/SRC/sgesvdq.f @@ -365,7 +365,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f index 23b6f0077..7e5fddeb0 100644 --- a/lapack-netlib/SRC/sgsvj0.f +++ b/lapack-netlib/SRC/sgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. 
*> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f index 0345ccb42..f1d1f7239 100644 --- a/lapack-netlib/SRC/sgsvj1.f +++ b/lapack-netlib/SRC/sgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.f b/lapack-netlib/SRC/sla_gbrfsx_extended.f index 499d6bf90..c79330cb5 100644 --- a/lapack-netlib/SRC/sla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/sla_gbrfsx_extended.f @@ -644,7 +644,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.f b/lapack-netlib/SRC/sla_gerfsx_extended.f index de05d8eb4..1a19a5071 100644 --- a/lapack-netlib/SRC/sla_gerfsx_extended.f +++ b/lapack-netlib/SRC/sla_gerfsx_extended.f @@ -628,7 +628,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL SAXPY( N, 1.0, DY, 1, Y( 1, J ), 1 ) diff --git a/lapack-netlib/SRC/sla_porfsx_extended.f b/lapack-netlib/SRC/sla_porfsx_extended.f index ada4cad21..9f33b14dd 100644 --- a/lapack-netlib/SRC/sla_porfsx_extended.f +++ b/lapack-netlib/SRC/sla_porfsx_extended.f @@ -617,7 +617,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. 
EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/sla_porpvgrw.f b/lapack-netlib/SRC/sla_porpvgrw.f index a97e7f72a..8064bf7fa 100644 --- a/lapack-netlib/SRC/sla_porpvgrw.f +++ b/lapack-netlib/SRC/sla_porpvgrw.f @@ -132,9 +132,9 @@ * UPPER = LSAME( 'Upper', UPLO ) * -* SPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* SPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.f b/lapack-netlib/SRC/sla_syrfsx_extended.f index d5096be02..2fa3b1c50 100644 --- a/lapack-netlib/SRC/sla_syrfsx_extended.f +++ b/lapack-netlib/SRC/sla_syrfsx_extended.f @@ -647,7 +647,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/slaed2.f b/lapack-netlib/SRC/slaed2.f index 16500e74c..cadf53555 100644 --- a/lapack-netlib/SRC/slaed2.f +++ b/lapack-netlib/SRC/slaed2.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, +* SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, * Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * .. Scalar Arguments .. @@ -28,7 +28,7 @@ * .. Array Arguments .. * INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), * $ INDXQ( * ) -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ W( * ), Z( * ) * .. * @@ -123,9 +123,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> SLAED3 to form the secular equation. *> \endverbatim @@ -148,7 +148,7 @@ *> \param[out] INDX *> \verbatim *> INDX is INTEGER array, dimension (N) -*> The permutation used to sort the contents of DLAMDA into +*> The permutation used to sort the contents of DLAMBDA into *> ascending order. *> \endverbatim *> @@ -207,7 +207,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, + SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, $ Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * -- LAPACK computational routine -- @@ -221,7 +221,7 @@ * .. Array Arguments .. INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), $ INDXQ( * ) - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ W( * ), Z( * ) * .. 
* @@ -300,9 +300,9 @@ * re-integrate the deflated parts from the last pass * DO 20 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) 20 CONTINUE - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDXC ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC ) DO 30 I = 1, N INDX( I ) = INDXQ( INDXC( I ) ) 30 CONTINUE @@ -324,11 +324,11 @@ DO 40 J = 1, N I = INDX( J ) CALL SCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 ) - DLAMDA( J ) = D( I ) + DLAMBDA( J ) = D( I ) IQ2 = IQ2 + N 40 CONTINUE CALL SLACPY( 'A', N, N, Q2, N, Q, LDQ ) - CALL SCOPY( N, DLAMDA, 1, D, 1 ) + CALL SCOPY( N, DLAMBDA, 1, D, 1 ) GO TO 190 END IF * @@ -421,7 +421,7 @@ PJ = NJ ELSE K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ @@ -433,7 +433,7 @@ * Record the last eigenvalue. * K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * @@ -470,9 +470,9 @@ PSM( CT ) = PSM( CT ) + 1 130 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * I = 1 diff --git a/lapack-netlib/SRC/slaed3.f b/lapack-netlib/SRC/slaed3.f index e84f22be1..44c601f91 100644 --- a/lapack-netlib/SRC/slaed3.f +++ b/lapack-netlib/SRC/slaed3.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, +* SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, * CTOT, W, S, INFO ) * * .. Scalar Arguments .. @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * INTEGER CTOT( * ), INDX( * ) -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ S( * ), W( * ) * .. * @@ -44,12 +44,6 @@ *> being combined by the matrix of eigenvectors of the K-by-K system *> which is solved here. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -104,14 +98,12 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in,out] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (K) +*> DLAMBDA is REAL array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles -*> of the secular equation. May be changed on output by -*> having lowest order bit set to zero on Cray X-MP, Cray Y-MP, -*> Cray-2, or Cray C-90, as described above. +*> of the secular equation. *> \endverbatim *> *> \param[in] Q2 @@ -180,7 +172,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, + SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, $ CTOT, W, S, INFO ) * * -- LAPACK computational routine -- @@ -193,7 +185,7 @@ * .. * .. Array Arguments .. 
INTEGER CTOT( * ), INDX( * ) - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ S( * ), W( * ) * .. * @@ -208,8 +200,8 @@ REAL TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, SLACPY, SLAED4, SLASET, XERBLA @@ -239,30 +231,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = 1, K - CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -293,10 +264,10 @@ CALL SCOPY( K, Q, LDQ+1, W, 1 ) DO 60 J = 1, K DO 40 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 40 CONTINUE DO 50 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE 60 CONTINUE DO 70 I = 1, K diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f index 9c8ba440c..9dd8a15f6 100644 --- a/lapack-netlib/SRC/slaed8.f +++ b/lapack-netlib/SRC/slaed8.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, -* CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, +* CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, * GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * .. Scalar Arguments .. @@ -30,7 +30,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), +* REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), * $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -141,9 +141,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> SLAED3 to form the secular equation. *> \endverbatim @@ -238,7 +238,7 @@ * * ===================================================================== SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, - $ CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, + $ CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, $ GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * -- LAPACK computational routine -- @@ -253,7 +253,7 @@ * .. Array Arguments .. 
INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), + REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -339,14 +339,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -464,7 +464,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -476,26 +476,26 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 110 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * IF( ICOMPQ.EQ.0 ) THEN DO 120 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) 120 CONTINUE ELSE DO 130 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL SCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 130 CONTINUE @@ -506,9 +506,9 @@ * IF( K.LT.N ) THEN IF( ICOMPQ.EQ.0 ) THEN - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) ELSE - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL SLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, $ Q( 1, K+1 ), LDQ ) END IF diff --git a/lapack-netlib/SRC/slaed9.f b/lapack-netlib/SRC/slaed9.f index 4d07416e9..d1b7b29fd 100644 --- a/lapack-netlib/SRC/slaed9.f +++ b/lapack-netlib/SRC/slaed9.f @@ -18,15 +18,15 @@ * Definition: * =========== * -* SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, -* S, LDS, INFO ) +* SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, +* W, S, LDS, INFO ) * * .. Scalar Arguments .. * INTEGER INFO, K, KSTART, KSTOP, LDQ, LDS, N * REAL RHO * .. * .. Array Arguments .. -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), * $ W( * ) * .. * @@ -96,9 +96,9 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (K) +*> DLAMBDA is REAL array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles *> of the secular equation. @@ -151,8 +151,8 @@ *> at Berkeley, USA * * ===================================================================== - SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, - $ S, LDS, INFO ) + SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, + $ W, S, LDS, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -163,7 +163,7 @@ REAL RHO * .. * .. Array Arguments .. - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), $ W( * ) * .. 
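A side note on an idiom these hunks leave untouched in both SLAED3 and SLAED9: CALL SCOPY( K, Q, LDQ+1, W, 1 ) copies the diagonal of a column-major array with a single BLAS call, because element (I,I) of Q sits LDQ+1 entries after element (I-1,I-1). The equivalent loop, spelled out without BLAS so it runs standalone:

      PROGRAM DIAGCOPY
      INTEGER LDQ, K, I, J
      PARAMETER ( LDQ = 3, K = 3 )
      REAL Q( LDQ, K ), W( K )
*     Fill Q(I,J) = 10*I + J, so the diagonal is 11, 22, 33.
      DO J = 1, K
         DO I = 1, LDQ
            Q( I, J ) = 10.0*I + J
         END DO
      END DO
*     Same effect as CALL SCOPY( K, Q, LDQ+1, W, 1 ).
      DO I = 1, K
         W( I ) = Q( I, I )
      END DO
      WRITE( *, * ) W
      END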
* @@ -174,8 +174,8 @@ REAL TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SLAED4, XERBLA @@ -212,30 +212,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, N - DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = KSTART, KSTOP - CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -261,10 +240,10 @@ CALL SCOPY( K, Q, LDQ+1, W, 1 ) DO 70 J = 1, K DO 50 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE DO 60 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 60 CONTINUE 70 CONTINUE DO 80 I = 1, K diff --git a/lapack-netlib/SRC/slals0.f b/lapack-netlib/SRC/slals0.f index 7d44e2864..f168dc653 100644 --- a/lapack-netlib/SRC/slals0.f +++ b/lapack-netlib/SRC/slals0.f @@ -389,6 +389,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = POLES( I, 2 )*Z( I ) / $ ( SLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -440,6 +445,11 @@ IF( Z( J ).EQ.ZERO ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/slalsa.f b/lapack-netlib/SRC/slalsa.f index 465455e4a..95becc76e 100644 --- a/lapack-netlib/SRC/slalsa.f +++ b/lapack-netlib/SRC/slalsa.f @@ -43,9 +43,9 @@ *> *> \verbatim *> -*> SLALSA is an itermediate step in solving the least squares problem +*> SLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). 
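The comments being added to slals0.f here describe the one job SLAMC3 keeps after this patch: freezing evaluation order. SLAMC3( X, Y ) merely returns X + Y, but since it is an opaque external call the sum is complete before anything is subtracted from it, so an optimizer that would rewrite (x+y)+z as x+(y+z) cannot. The two orders genuinely differ under cancellation, as this sketch shows when compiled without aggressive cross-unit inlining; ORDADD is a hypothetical stand-in for SLAMC3:

      PROGRAM REASSOC
      REAL X, Y, Z, S1, S2
      REAL ORDADD
      EXTERNAL ORDADD
      X = 1.0E8
      Y = -1.0E8
      Z = 1.0
*     Forced order (x+y)+z: the call finishes x+y first, giving 1.0.
      S1 = ORDADD( X, Y ) + Z
*     Order x+(y+z): z is absorbed into y and lost, giving 0.0.
      S2 = X + ( Y + Z )
      WRITE( *, * ) 'S1 =', S1, '  S2 =', S2
      END

      REAL FUNCTION ORDADD( A, B )
      REAL A, B
      ORDADD = A + B
      END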
*> *> If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/slalsd.f b/lapack-netlib/SRC/slalsd.f index 2197f728e..9943a52d9 100644 --- a/lapack-netlib/SRC/slalsd.f +++ b/lapack-netlib/SRC/slalsd.f @@ -47,12 +47,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 2e06f9d42..8b2d3286e 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -100,7 +100,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -329,7 +329,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, + EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, $ SLARTG, SROT REAL, EXTERNAL :: SLAMCH, SLANHS LOGICAL, EXTERNAL :: LSAME @@ -479,7 +479,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) @@ -564,7 +563,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f index 7719f2965..f1fdef790 100644 --- a/lapack-netlib/SRC/slarfb_gett.f +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -451,7 +451,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f index 2e34ca5a6..e1b52c385 100644 --- a/lapack-netlib/SRC/slarre.f +++ b/lapack-netlib/SRC/slarre.f @@ -51,7 +51,7 @@ *> SSTEMR to compute the eigenvectors of T. *> The accuracy varies depending on whether bisection is used to *> find a few eigenvalues or the dqds algorithm (subroutine SLASQ2) to -*> conpute all and then discard any unwanted one. +*> compute all and then discard any unwanted one. *> As an added benefit, SLARRE also outputs the n *> Gerschgorin intervals for the matrices L_i D_i L_i^T. *> \endverbatim diff --git a/lapack-netlib/SRC/slaruv.f b/lapack-netlib/SRC/slaruv.f index c25dc2e2f..cd37a1c47 100644 --- a/lapack-netlib/SRC/slaruv.f +++ b/lapack-netlib/SRC/slaruv.f @@ -382,6 +382,11 @@ $ 1537 / * .. * .. Executable Statements .. +* +* Quick return for N < 1 + IF ( N < 1 ) THEN + RETURN + END IF * I1 = ISEED( 1 ) I2 = ISEED( 2 ) diff --git a/lapack-netlib/SRC/slas2.f b/lapack-netlib/SRC/slas2.f index 6ae86204c..02ecbf434 100644 --- a/lapack-netlib/SRC/slas2.f +++ b/lapack-netlib/SRC/slas2.f @@ -93,9 +93,7 @@ *> infinite. 
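The slaqz0.f hunks above also retire CALL SLABAD( SAFMIN, SAFMAX ). SLABAD existed to shrink the usable range on machines whose largest magnitudes could not safely be squared (the Crays again) and returned its arguments unchanged on IEEE hardware, so the scaling thresholds now come straight from SLAMCH. A sketch of the surviving pattern, mirroring the constants slaqz0 computes (link against LAPACK for SLAMCH):

      PROGRAM MACHCON
      INTEGER N
      REAL SAFMIN, SAFMAX, ULP, SMLNUM
      REAL SLAMCH
      EXTERNAL SLAMCH
      N = 100
      SAFMIN = SLAMCH( 'SAFE MINIMUM' )
      SAFMAX = 1.0E0 / SAFMIN
*     Formerly: CALL SLABAD( SAFMIN, SAFMAX ), a no-op under IEEE.
      ULP = SLAMCH( 'PRECISION' )
      SMLNUM = SAFMIN*( REAL( N ) / ULP )
      WRITE( *, * ) SAFMIN, SAFMAX, ULP, SMLNUM
      END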
*> *> Overflow will not occur unless the largest singular value itself -*> overflows, or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows, or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/slasd0.f b/lapack-netlib/SRC/slasd0.f index a45f741a9..c8a5c7838 100644 --- a/lapack-netlib/SRC/slasd0.f +++ b/lapack-netlib/SRC/slasd0.f @@ -79,10 +79,11 @@ *> On exit, E has been destroyed. *> \endverbatim *> -*> \param[out] U +*> \param[in,out] U *> \verbatim *> U is REAL array, dimension (LDU, N) -*> On exit, U contains the left singular vectors. +*> On exit, U contains the left singular vectors, +*> if U passed in as (N, N) Identity. *> \endverbatim *> *> \param[in] LDU @@ -91,10 +92,11 @@ *> On entry, leading dimension of U. *> \endverbatim *> -*> \param[out] VT +*> \param[in,out] VT *> \verbatim *> VT is REAL array, dimension (LDVT, M) -*> On exit, VT**T contains the right singular vectors. +*> On exit, VT**T contains the right singular vectors, +*> if VT passed in as (M, M) Identity. *> \endverbatim *> *> \param[in] LDVT diff --git a/lapack-netlib/SRC/slasd3.f b/lapack-netlib/SRC/slasd3.f index f9420f88a..8f74743c2 100644 --- a/lapack-netlib/SRC/slasd3.f +++ b/lapack-netlib/SRC/slasd3.f @@ -44,13 +44,6 @@ *> appropriate calls to SLASD4 and then updates the singular *> vectors by matrix multiplication. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> SLASD3 is called from SLASD1. *> \endverbatim * @@ -103,7 +96,7 @@ *> The leading dimension of the array Q. LDQ >= K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is REAL array, dimension(K) *> The first K elements of this array contain the old roots @@ -249,8 +242,8 @@ REAL RHO, TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, SLACPY, SLASCL, SLASD4, XERBLA @@ -310,27 +303,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DSIGMA(I) to prevent optimizing compilers from eliminating -* this code. 
-* - DO 20 I = 1, K - DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 20 CONTINUE -* * Keep a copy of Z. * CALL SCOPY( K, Z, 1, Q, 1 ) diff --git a/lapack-netlib/SRC/slasd8.f b/lapack-netlib/SRC/slasd8.f index 43b171e5f..df5002367 100644 --- a/lapack-netlib/SRC/slasd8.f +++ b/lapack-netlib/SRC/slasd8.f @@ -121,14 +121,12 @@ *> The leading dimension of DIFR, must be at least K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is REAL array, dimension ( K ) *> On entry, the first K elements of this array contain the old *> roots of the deflated updating problem. These are the poles *> of the secular equation. -*> On exit, the elements of DSIGMA may be very slightly altered -*> in value. *> \endverbatim *> *> \param[out] WORK @@ -227,27 +225,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 10 CONTINUE -* * Book keeping. * IWK1 = 1 @@ -312,6 +289,11 @@ DSIGJP = -DSIGMA( J+1 ) END IF WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ ) +* +* Use calls to the subroutine SLAMC3 to enforce the parentheses +* (x+y)+z. The goal is to prevent optimizing compilers +* from doing x+(y+z). +* DO 60 I = 1, J - 1 WORK( I ) = Z( I ) / ( SLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ ) $ / ( DSIGMA( I )+DJ ) diff --git a/lapack-netlib/SRC/slasv2.f b/lapack-netlib/SRC/slasv2.f index 6b98e9a0c..bf5d3ea0e 100644 --- a/lapack-netlib/SRC/slasv2.f +++ b/lapack-netlib/SRC/slasv2.f @@ -124,9 +124,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index bc7cf343d..95e0ddcce 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB * M. 
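The slaswlq.f hunk here (like the slatsqr.f and sorgtsqr.f hunks below) only restores a dropped "LWORK is INTEGER" line, but the LWORK = -1 convention described in the surrounding text is worth one concrete sketch: the query call writes the optimal size into WORK(1) and performs no other work. Shown with SGEQRF, whose interface is stable, rather than with the routines touched by this patch (link against LAPACK):

      PROGRAM WSQUERY
      INTEGER M, N, LDA, LWORK, INFO
      PARAMETER ( M = 50, N = 20, LDA = M )
      REAL A( LDA, N ), TAU( N ), WQUERY( 1 )
      REAL, ALLOCATABLE :: WORK( : )
      A = 1.0E0
*     First call: LWORK = -1 asks only for the optimal size.
      CALL SGEQRF( M, N, A, LDA, TAU, WQUERY, -1, INFO )
      LWORK = INT( WQUERY( 1 ) )
      ALLOCATE( WORK( LWORK ) )
*     Second call factors A using the correctly sized workspace.
      CALL SGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
      WRITE( *, * ) 'optimal LWORK =', LWORK, '  INFO =', INFO
      END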
*> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 0761d656f..9765ea3d7 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -261,6 +261,9 @@ REAL BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + REAL WORK (1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX @@ -362,7 +365,7 @@ * A is upper triangular. * DO J = 2, N - TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index c3a08e524..8f0c4bf16 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -574,7 +574,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 33966c01f..86733bb15 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/sorbdb1.f b/lapack-netlib/SRC/sorbdb1.f index c860f4366..191e5742a 100644 --- a/lapack-netlib/SRC/sorbdb1.f +++ b/lapack-netlib/SRC/sorbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f index 484d352f8..b2ff34bb1 100644 --- a/lapack-netlib/SRC/sorbdb2.f +++ b/lapack-netlib/SRC/sorbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb3.f b/lapack-netlib/SRC/sorbdb3.f index 6209b24ee..99478c5d0 100644 --- a/lapack-netlib/SRC/sorbdb3.f +++ b/lapack-netlib/SRC/sorbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f index bf60fb7bb..0fef5b759 100644 --- a/lapack-netlib/SRC/sorbdb4.f +++ b/lapack-netlib/SRC/sorbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> SORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f index a755fa4a4..692eba1d9 100644 --- a/lapack-netlib/SRC/sorgtsqr.f +++ b/lapack-netlib/SRC/sorgtsqr.f @@ -133,6 
+133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -301,4 +302,4 @@ * * End of SORGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/spbsv.f b/lapack-netlib/SRC/spbsv.f index 2d084424e..8929321ea 100644 --- a/lapack-netlib/SRC/spbsv.f +++ b/lapack-netlib/SRC/spbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/spbsvx.f b/lapack-netlib/SRC/spbsvx.f index 27907eb85..6abb6caae 100644 --- a/lapack-netlib/SRC/spbsvx.f +++ b/lapack-netlib/SRC/spbsvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -281,10 +281,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spbtf2.f b/lapack-netlib/SRC/spbtf2.f index ff08bc327..b5aac1ccd 100644 --- a/lapack-netlib/SRC/spbtf2.f +++ b/lapack-netlib/SRC/spbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spbtrf.f b/lapack-netlib/SRC/spbtrf.f index ef5dcbb98..d3ae216e5 100644 --- a/lapack-netlib/SRC/spbtrf.f +++ b/lapack-netlib/SRC/spbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. 
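The rewording repeated through these hunks, "the leading principal minor of order i ... is not positive" in place of "not positive definite" (a minor is a scalar, so "definite" was a category error), leans on Sylvester's criterion: a symmetric matrix is positive definite exactly when every leading principal minor is positive, and the Cholesky factorization breaks down at step i exactly when the order-i minor is the first to fail. A worked 2-by-2 case:

    A = ( 1  2 )      det A(1:1,1:1) = 1 > 0
        ( 2  1 )      det A(1:2,1:2) = 1*1 - 2*2 = -3 <= 0

So A is not positive definite, and A = U**T*U stalls at step 2, where the pivot a22 - a12**2/a11 = 1 - 4 = -3 would need a real square root; a driver showing SPOTRF reporting exactly this follows the spotrf hunks below.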
*> \endverbatim * diff --git a/lapack-netlib/SRC/spftrf.f b/lapack-netlib/SRC/spftrf.f index 9e2c11eea..0ed0e3abd 100644 --- a/lapack-netlib/SRC/spftrf.f +++ b/lapack-netlib/SRC/spftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/sposv.f b/lapack-netlib/SRC/sposv.f index fe0a35a56..336332d2d 100644 --- a/lapack-netlib/SRC/sposv.f +++ b/lapack-netlib/SRC/sposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/sposvx.f b/lapack-netlib/SRC/sposvx.f index bcf38c7ea..0770897d2 100644 --- a/lapack-netlib/SRC/sposvx.f +++ b/lapack-netlib/SRC/sposvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -277,10 +277,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/sposvxx.f b/lapack-netlib/SRC/sposvxx.f index 08adf1d58..19e599f64 100644 --- a/lapack-netlib/SRC/sposvxx.f +++ b/lapack-netlib/SRC/sposvxx.f @@ -88,7 +88,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). 
If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/spotf2.f b/lapack-netlib/SRC/spotf2.f index 5b3504834..773b768b2 100644 --- a/lapack-netlib/SRC/spotf2.f +++ b/lapack-netlib/SRC/spotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spotrf.f b/lapack-netlib/SRC/spotrf.f index 5d5771c86..12ef58a40 100644 --- a/lapack-netlib/SRC/spotrf.f +++ b/lapack-netlib/SRC/spotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spotrf2.f b/lapack-netlib/SRC/spotrf2.f index ae0484ce1..ef731ffa2 100644 --- a/lapack-netlib/SRC/spotrf2.f +++ b/lapack-netlib/SRC/spotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/sppsv.f b/lapack-netlib/SRC/sppsv.f index 2d3fb3d91..1f48dd4db 100644 --- a/lapack-netlib/SRC/sppsv.f +++ b/lapack-netlib/SRC/sppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/sppsvx.f b/lapack-netlib/SRC/sppsvx.f index 7d71efcd5..bd2da20ee 100644 --- a/lapack-netlib/SRC/sppsvx.f +++ b/lapack-netlib/SRC/sppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. 
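Picking up the 2-by-2 example from above, a minimal driver (link against LAPACK) that has SPOTRF report the failing minor through INFO, in the terms of the reworded documentation:

      PROGRAM PDCHECK
      INTEGER INFO
      REAL A( 2, 2 )
*     Column-major fill: A = ( 1 2 ; 2 1 ).
      DATA A / 1.0, 2.0, 2.0, 1.0 /
*     Attempt the Cholesky factorization A = U**T * U.
      CALL SPOTRF( 'U', 2, A, 2, INFO )
      IF( INFO.EQ.0 ) THEN
         WRITE( *, * ) 'A is positive definite'
      ELSE IF( INFO.GT.0 ) THEN
         WRITE( *, * ) 'leading principal minor of order', INFO,
     $      ' is not positive'
      END IF
      END

For this A the driver takes the INFO = 2 branch, matching the hand computation of the second minor.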
*> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spptrf.f b/lapack-netlib/SRC/spptrf.f index 0f90399cc..be51c4824 100644 --- a/lapack-netlib/SRC/spptrf.f +++ b/lapack-netlib/SRC/spptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spteqr.f b/lapack-netlib/SRC/spteqr.f index 0d6bf911b..6e6e9aa45 100644 --- a/lapack-netlib/SRC/spteqr.f +++ b/lapack-netlib/SRC/spteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/sptsv.f b/lapack-netlib/SRC/sptsv.f index 46aadf4c3..f11c22594 100644 --- a/lapack-netlib/SRC/sptsv.f +++ b/lapack-netlib/SRC/sptsv.f @@ -93,8 +93,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/sptsvx.f b/lapack-netlib/SRC/sptsvx.f index 8dd04b054..eaa691cef 100644 --- a/lapack-netlib/SRC/sptsvx.f +++ b/lapack-netlib/SRC/sptsvx.f @@ -59,7 +59,7 @@ *> factorization can also be regarded as having the form *> A = U**T*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -199,10 +199,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. 
Nevertheless, the diff --git a/lapack-netlib/SRC/spttrf.f b/lapack-netlib/SRC/spttrf.f index 2217a7338..e083b7456 100644 --- a/lapack-netlib/SRC/spttrf.f +++ b/lapack-netlib/SRC/spttrf.f @@ -70,8 +70,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/ssbevd.f b/lapack-netlib/SRC/ssbevd.f index bcf14ce85..e87f9a030 100644 --- a/lapack-netlib/SRC/ssbevd.f +++ b/lapack-netlib/SRC/ssbevd.f @@ -40,12 +40,6 @@ *> a real symmetric band matrix A. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssbevd_2stage.f b/lapack-netlib/SRC/ssbevd_2stage.f index 9687ee024..014bade48 100644 --- a/lapack-netlib/SRC/ssbevd_2stage.f +++ b/lapack-netlib/SRC/ssbevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssbgvd.f b/lapack-netlib/SRC/ssbgvd.f index 6dd1fe952..7c21ee455 100644 --- a/lapack-netlib/SRC/ssbgvd.f +++ b/lapack-netlib/SRC/ssbgvd.f @@ -43,12 +43,6 @@ *> banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f index 271f35964..3607fae82 100644 --- a/lapack-netlib/SRC/ssbgvx.f +++ b/lapack-netlib/SRC/ssbgvx.f @@ -267,7 +267,7 @@ *> Their indices are stored in IFAIL. *> > N: SPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. 
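For the generalized drivers in these hunks (SSBGVX above, the SSYGV family below) INFO is overloaded: 0 < INFO <= N flags a convergence failure inside the eigensolver itself, while INFO = N + i means the Cholesky factorization of B stopped at step i, so nothing was computed at all. A sketch of the caller-side decoding, with stand-in values where a real call to one of these drivers would go:

      PROGRAM GVINFO
      INTEGER INFO, N
*     Stand-in values; in real use these come from SSYGV, SSBGVX, ...
      N = 10
      INFO = N + 3
      IF( INFO.EQ.0 ) THEN
         WRITE( *, * ) 'all eigenvalues computed'
      ELSE IF( INFO.LT.0 ) THEN
         WRITE( *, * ) 'argument', -INFO, ' had an illegal value'
      ELSE IF( INFO.LE.N ) THEN
         WRITE( *, * ) INFO, ' element(s) failed to converge'
      ELSE
         WRITE( *, * ) 'leading principal minor of order', INFO - N,
     $      ' of B is not positive'
      END IF
      END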
*> \endverbatim diff --git a/lapack-netlib/SRC/sspevd.f b/lapack-netlib/SRC/sspevd.f index 56329da34..0872e95ac 100644 --- a/lapack-netlib/SRC/sspevd.f +++ b/lapack-netlib/SRC/sspevd.f @@ -40,12 +40,6 @@ *> of a real symmetric matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sspgv.f b/lapack-netlib/SRC/sspgv.f index c73e94e60..e8bc66e5d 100644 --- a/lapack-netlib/SRC/sspgv.f +++ b/lapack-netlib/SRC/sspgv.f @@ -139,7 +139,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero. *> > N: if INFO = n + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 73862ed1b..1a88365f2 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -184,7 +178,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sspgvx.f b/lapack-netlib/SRC/sspgvx.f index de581543a..6d5b4ed3d 100644 --- a/lapack-netlib/SRC/sspgvx.f +++ b/lapack-netlib/SRC/sspgvx.f @@ -245,7 +245,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sstedc.f b/lapack-netlib/SRC/sstedc.f index 925b03422..61e3c2fda 100644 --- a/lapack-netlib/SRC/sstedc.f +++ b/lapack-netlib/SRC/sstedc.f @@ -42,12 +42,6 @@ *> found if SSYTRD or SSPTRD or SSBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. 
-*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sstegr.f b/lapack-netlib/SRC/sstegr.f index 2e2975fdf..2967a6969 100644 --- a/lapack-netlib/SRC/sstegr.f +++ b/lapack-netlib/SRC/sstegr.f @@ -56,7 +56,7 @@ *> *> Note : SSTEGR and SSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f index 3a9bbe784..2ed697b69 100644 --- a/lapack-netlib/SRC/sstemr.f +++ b/lapack-netlib/SRC/sstemr.f @@ -303,7 +303,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -312,7 +312,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE SSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -344,7 +345,8 @@ $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -378,6 +380,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * SSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -500,6 +503,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -507,8 +519,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -531,8 +548,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/sstevd.f b/lapack-netlib/SRC/sstevd.f index bc5b5aaab..218af8c76 100644 --- a/lapack-netlib/SRC/sstevd.f +++ b/lapack-netlib/SRC/sstevd.f @@ -40,12 +40,6 @@ *> real symmetric tridiagonal matrix. If eigenvectors are desired, it *> uses a divide and conquer algorithm. 
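The LAESWAP machinery added to sstemr.f above exists because SLAEV2 orders its outputs by magnitude, |RT1| >= |RT2|, while the consuming code assumes RT1 >= RT2 algebraically; any 2-by-2 whose larger-magnitude eigenvalue is negative trips the mismatch. A demonstration on such a matrix (link against LAPACK):

      PROGRAM EV2ORD
      REAL RT1, RT2, CS1, SN1
*     Eigenvalues of ( -3 0 ; 0 1 ) are -3 and 1.
      CALL SLAEV2( -3.0, 0.0, 1.0, RT1, RT2, CS1, SN1 )
*     SLAEV2 guarantees ABS(RT1) >= ABS(RT2), so RT1 = -3, RT2 = 1:
*     RT1 < RT2, exactly the case SSTEMR now detects and swaps.
      WRITE( *, * ) 'RT1 =', RT1, '  RT2 =', RT2
      END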
*> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssyconvf.f b/lapack-netlib/SRC/ssyconvf.f index 6defc1f0e..af55da51a 100644 --- a/lapack-netlib/SRC/ssyconvf.f +++ b/lapack-netlib/SRC/ssyconvf.f @@ -39,7 +39,7 @@ *> SSYTRF provided on entry in parameter A into the factorization *> output format used in SSYTRF_RK (or SSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in SSYTRF into +*> the interchanges stored in IPIV from the format used in SSYTRF into *> the format used in SSYTRF_RK (or SSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or SSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in SSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in SSYTRF_RK +*> the interchanges stored in IPIV from the format used in SSYTRF_RK *> (or SSYTRF_BK) into the format used in SSYTRF. *> \endverbatim * @@ -322,7 +322,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -466,7 +466,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -532,7 +532,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/ssyconvf_rook.f b/lapack-netlib/SRC/ssyconvf_rook.f index c59f257bb..efd7c57fe 100644 --- a/lapack-netlib/SRC/ssyconvf_rook.f +++ b/lapack-netlib/SRC/ssyconvf_rook.f @@ -517,7 +517,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index ac0d0284d..ee0e33384 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -40,13 +40,6 @@ *> real symmetric matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> Because of large use of BLAS of level 3, SSYEVD needs N**2 more *> workspace than SSYEVX. *> \endverbatim diff --git a/lapack-netlib/SRC/ssyevd_2stage.f b/lapack-netlib/SRC/ssyevd_2stage.f index f3fde6b4a..e63e280a7 100644 --- a/lapack-netlib/SRC/ssyevd_2stage.f +++ b/lapack-netlib/SRC/ssyevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. 
If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssygv.f b/lapack-netlib/SRC/ssygv.f index 270957fce..f39947d92 100644 --- a/lapack-netlib/SRC/ssygv.f +++ b/lapack-netlib/SRC/ssygv.f @@ -154,7 +154,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/ssygv_2stage.f b/lapack-netlib/SRC/ssygv_2stage.f index 49f357d90..3d9a44b5e 100644 --- a/lapack-netlib/SRC/ssygv_2stage.f +++ b/lapack-netlib/SRC/ssygv_2stage.f @@ -173,7 +173,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 7c7e0de01..3c8bd2a0e 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -42,12 +42,6 @@ *> B are assumed to be symmetric and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -190,7 +184,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/ssygvx.f b/lapack-netlib/SRC/ssygvx.f index e93da60cc..344075c9f 100644 --- a/lapack-netlib/SRC/ssygvx.f +++ b/lapack-netlib/SRC/ssygvx.f @@ -270,7 +270,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. 
*> \endverbatim diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index b9eac8342..aa862f14b 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -88,7 +88,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index faeeff5f4..b8386670a 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -506,7 +506,7 @@ $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index d188589b9..31e38e466 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index 13e849cdc..4ba026fc8 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index 94b3aa4b5..07357f2ab 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index ec84fcb1b..8e1ef460a 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/stgevc.f b/lapack-netlib/SRC/stgevc.f index 15fc88c4b..be4cb1829 100644 --- a/lapack-netlib/SRC/stgevc.f +++ b/lapack-netlib/SRC/stgevc.f @@ -52,7 +52,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal blocks of S and P. *> @@ -337,7 +337,7 @@ EXTERNAL LSAME, SLAMCH * .. * .. External Subroutines .. - EXTERNAL SGEMV, SLABAD, SLACPY, SLAG2, SLALN2, XERBLA + EXTERNAL SGEMV, SLACPY, SLAG2, SLALN2, XERBLA * .. * .. Intrinsic Functions .. 
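The SSYTRF and SSYTRF_RK changes above, LWKOPT = MAX( 1, N*NB ), guard the degenerate N = 0 workspace query: WORK needs at least one element to return the answer in WORK(1), and an advertised optimum of 0 would steer a caller into allocating an empty array and then failing the LWORK argument check on the real call. The guard in isolation, with hypothetical values:

      PROGRAM WKOPT0
      INTEGER N, NB, LWKOPT
      N = 0
      NB = 32
*     Without the MAX the query would advertise 0, an illegal LWORK.
      LWKOPT = MAX( 1, N*NB )
      WRITE( *, * ) 'advertised optimal LWORK =', LWKOPT
      END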
INTRINSIC ABS, MAX, MIN @@ -463,7 +463,6 @@ * SAFMIN = SLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL SLABAD( SAFMIN, BIG ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/stgsna.f b/lapack-netlib/SRC/stgsna.f index 40f822915..430f3c4b7 100644 --- a/lapack-netlib/SRC/stgsna.f +++ b/lapack-netlib/SRC/stgsna.f @@ -632,8 +632,8 @@ C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA ) C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI ROOT1 = C1 + SQRT( C1*C1-4.0*C2 ) - ROOT2 = C2 / ROOT1 ROOT1 = ROOT1 / TWO + ROOT2 = C2 / ROOT1 COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) ) END IF * diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f index 5af57123b..253cbc24c 100644 --- a/lapack-netlib/SRC/strevc3.f +++ b/lapack-netlib/SRC/strevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'STREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f index 28762c2ed..ef3f2da83 100644 --- a/lapack-netlib/SRC/strsyl3.f +++ b/lapack-netlib/SRC/strsyl3.f @@ -1223,7 +1223,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/zbdsqr.f b/lapack-netlib/SRC/zbdsqr.f index 2ad6142a7..faedafc3c 100644 --- a/lapack-netlib/SRC/zbdsqr.f +++ b/lapack-netlib/SRC/zbdsqr.f @@ -259,7 +259,7 @@ $ NM12, NM13, OLDLL, OLDM DOUBLE PRECISION ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -372,7 +372,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -426,7 +426,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -435,7 +434,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -517,14 +515,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -545,14 +543,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -562,7 +560,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. 
$ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/zcposv.f b/lapack-netlib/SRC/zcposv.f index 9b1940d60..649d2c049 100644 --- a/lapack-netlib/SRC/zcposv.f +++ b/lapack-netlib/SRC/zcposv.f @@ -187,10 +187,10 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of -*> (COMPLEX*16) A is not positive definite, so the -*> factorization could not be completed, and the solution -*> has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of (COMPLEX*16) A is not positive, so the factorization +*> could not be completed, and the solution has not been +*> computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zgebal.f b/lapack-netlib/SRC/zgebal.f index d4a9e39f1..a467991d4 100644 --- a/lapack-netlib/SRC/zgebal.f +++ b/lapack-netlib/SRC/zgebal.f @@ -89,7 +89,7 @@ *> \param[out] IHI *> \verbatim *> IHI is INTEGER -*> ILO and IHI are set to INTEGER such that on exit +*> ILO and IHI are set to integers such that on exit *> A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N. *> If JOB = 'N' or 'S', ILO = 1 and IHI = N. *> \endverbatim @@ -155,6 +155,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -184,8 +187,8 @@ PARAMETER ( FACTOR = 0.95D+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L DOUBLE PRECISION C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -217,176 +220,194 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL ZSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL ZSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( DBLE( A( J, I ) ).NE.ZERO .OR. DIMAG( A( J, I ) ).NE. - $ ZERO )GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR. + $ DIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. 
+ EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL ZSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL ZSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR. + $ DIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL ZSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL ZSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( DBLE( A( I, J ) ).NE.ZERO .OR. DIMAG( A( I, J ) ).NE. - $ ZERO )GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = DZNRM2( L-K+1, A( K, I ), 1 ) - R = DZNRM2( L-K+1, A( I, K ), LDA ) - ICA = IZAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = IZAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( DISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'ZGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L * - CALL ZDSCAL( N-K+1, G, A( I, K ), LDA ) - CALL ZDSCAL( L, F, A( 1, I ), 1 ) + C = DZNRM2( L-K+1, A( K, I ), 1 ) + R = DZNRM2( L-K+1, A( I, K ), LDA ) + ICA = IZAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = IZAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - 200 CONTINUE +* Guard against zero C or R due to underflow. +* + IF( C.EQ.ZERO .OR. 
R.EQ.ZERO ) CYCLE
+*
+*           Exit if NaN to avoid infinite loop
*
-      IF( NOCONV )
-     $   GO TO 140
+            IF( DISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'ZGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL ZDSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL ZDSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
*
-  210 CONTINUE
      ILO = K
      IHI = L
*
diff --git a/lapack-netlib/SRC/zgedmd.c b/lapack-netlib/SRC/zgedmd.c
new file mode 100644
index 000000000..c1b39ba3e
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmd.c
@@ -0,0 +1,1676 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{ flag cierr;
+ ftnint ciunit;
+ flag ciend;
+ char *cifmt;
+ ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{ flag icierr;
+ char *iciunit;
+ flag iciend;
+ char *icifmt;
+ ftnint icirlen;
+ ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{ flag oerr;
+ ftnint ounit;
+ char *ofnm;
+ ftnlen ofnmlen;
+ char *osta;
+ char *oacc;
+ char *ofm;
+ ftnint orl;
+ char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{ flag cerr;
+ 
ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. 
See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1:K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient. */ +/* The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by ZGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */ +/* ZWORK is used as complex workspace in the complex SVD, as */ +/* specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing */ +/* the eigenvalues of a Rayleigh quotient. */ +/* If the call to ZGEDMD is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. 
*/ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), */ +/* where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal */ +/* LZWORK_SVD is calculated as follows */ +/* If WHTSVD == 1 :: ZGESVD :: */ +/* LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* If WHTSVD == 2 :: ZGESDD :: */ +/* LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */ +/* If WHTSVD == 3 :: ZGESVDQ :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: ZGEJSV :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If on entry LZWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths and returns them in */ +/* LZWORK(1) and LZWORK(2), respectively. */ +/* ..... */ +/* RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */ +/* On exit, RWORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */ +/* scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to ZGEDMD is only workspace query, then */ +/* RWORK(1) contains the minimal workspace length. */ +/* See the description of LRWORK. */ +/* ..... */ +/* LRWORK (input) INTEGER */ +/* The minimal length of the workspace vector RWORK. */ +/* LRWORK is calculated as follows: */ +/* LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where */ +/* LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */ +/* for the SVD subroutine determined by the input parameter */ +/* WHTSVD. */ +/* If WHTSVD == 1 :: ZGESVD :: */ +/* LRWORK_SVD = 5*MIN(M,N) */ +/* If WHTSVD == 2 :: ZGESDD :: */ +/* LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */ +/* 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */ +/* If WHTSVD == 3 :: ZGESVDQ :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: ZGEJSV :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If on entry LRWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* real workspace length and returns it in RWORK(1). */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for ZWORK, RWORK and */ +/* IWORK. See the descriptions of ZWORK, RWORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). 
*/ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --rwork; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + zzero.r = 0.f, zzero.i = 0.f; + zone.r = 1.f, zone.i = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -17; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -20; + } else if (*ldw < *n) { + *info = -22; + } else if (*lds < *n) { + *info = -24; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. 
*/ + if (lquery) { + iwork[1] = 1; + rwork[1] = 1.; + zwork[1].r = 2., zwork[1].i = 0.; + zwork[2].r = 2., zwork[2].i = 0.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + iminwr = 1; + mlrwrk = f2cmax(1,*n); + mlwork = 2; + olwork = 2; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of ZGESVD: */ +/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n); + mwrsvd = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrsvd); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[ + b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, + rdummy, &info1); + lwrsvd = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvd); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of ZGESDD: */ +/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */ +/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */ +/* In LAPACK 3.10.1 RWORK is defined differently. */ +/* Below we take f2cmax over the two versions. */ +/* IMINWR = 8*MIN(M,N) */ + mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax( + *m,*n); + mlwork = f2cmax(mlwork,mwrsdd); + iminwr = f2cmin(*m,*n) << 3; +/* Computing MAX */ +/* Computing MAX */ + i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(* + m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3, + i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) + << 1) * f2cmin(*m,*n) + f2cmin(*m,*n); + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, & + iwork[1], &info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) zwork[1].r; + lwrsdd = f2cmax(i__1,i__2); +/* Possible bug in ZGESDD optimal workspace size. 
*/ + olwork = f2cmax(olwork,lwrsdd); + } + } else if (*whtsvd == 3) { + zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvq); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0]; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvq); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; + zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, + &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[ + 1], &c_n1, rdummy, &c_n1, &iwork[1], &info1); + iminwr = iwork[1]; + mwrsvj = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvj); +/* Computing MAX */ +/* Computing MAX */ + i__3 = 7, i__4 = (integer) rdummy[0]; + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvj = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvj); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the ZGEEV call */ +/* Computing MAX */ + i__1 = 1, i__2 = *n << 1; + mwrkev = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrkev); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (*n << 1); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], + ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], & + info1); + lwrkev = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrkev); + } + + if (*liwork < iminwr && ! lquery) { + *info = -30; + } + if (*lrwork < mlrwrk && ! lquery) { + *info = -28; + } + if (*lzwork < mlwork && ! lquery) { + *info = -26; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + rwork[1] = (doublereal) mlrwrk; + zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.; + zwork[2].r = (doublereal) olwork, zwork[2].i = 0.; + return 0; + } +/* ............................................................ */ + + ofl = dlamch_("O"); + small = dlamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using ZLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DZNRM2( M, X(1,i), 1 ) */ + scale = zero; + zlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("ZGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. 
*/ + d__1 = one / rootsc; + zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[ + i__ * x_dim1 + 1], ldx, &info2); + rwork[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + x[i__ * x_dim1 + 1], ldx, &info2); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK CALL */ + } + } else { + rwork[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (rwork[i__] > zero) { + d__1 = one / rwork[i__]; + zdscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + d__1 = -rwork[i__]; + d__2 = one / (doublereal) (*m); + zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * + y_dim1 + 1], ldy, &info2); +/* LAPACK C */ + } else if (z_abs(&y[izamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ + * y_dim1]) != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + zdscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using ZLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* RWORK(i) = DZNRM2( M, Y(1,i), 1 ) */ + scale = zero; + zlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("ZGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[ + i__ * y_dim1 + 1], ldy, &info2); + rwork[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* Y(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + y[i__ * y_dim1 + 1], ldy, &info2); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPAC */ + } + } else { + rwork[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (rwork[i__] > zero) { + d__1 = one / rwork[i__]; + zdscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! 
INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + d__1 = -rwork[i__]; + d__2 = one / (doublereal) (*m); + zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * + x_dim1 + 1], ldx, &info2); +/* LAPACK C */ + } else if (z_abs(&x[izamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ + * x_dim1]) != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], & + info1); +/* LA */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 2) { + zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, & + w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1] + , &info1); +/* LAP */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 3) { + i__1 = *lrwork - *n; + zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], + &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1); +/* LAPACK CA */ + zlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 4) { + i__1 = *lrwork - *n; + zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, & + rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], + lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1); + zlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = rwork[*n + 1]; + xscl2 = rwork[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case ZGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + zlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (rwork[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. 
*/ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^H * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^H is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + d__1 = one / rwork[i__]; + zdscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that ZGESVD, ZGESVDQ and ZGESDD return the */ +/* adjoint matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + rwork[*n + i__] = one / rwork[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * w_dim1; + i__4 = *n + i__; + z__2.r = rwork[i__4], z__2.i = zero; + i__5 = i__ + j * w_dim1; + z__1.r = z__2.r * w[i__5].r - z__2.i * w[i__5].i, z__1.i = + z__2.r * w[i__5].i + z__2.i * w[i__5].r; + w[i__3].r = z__1.r, w[i__3].i = z__1.i; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside ZGEDMD). */ + zgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLA */ + zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + zgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], + ldz, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* BLA */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. 
*/ + zgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy, + &zzero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) */ + + zgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^H is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + zlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + zlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + zgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[ + w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. See the description of Z. */ +/* Also, see the description of ZGEEV. */ +/* LAPACK CALL */ + if (info1 > 0) { +/* ZGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + zgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */ + zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[ + s_offset], lds, &zzero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + zlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], + lds, &zzero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */ +/* LDS, ZZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. 
*/ +/* CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the Ritz vectors */ + if (wntvec) { + zgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + z__1.r = -eigs[i__2].r, z__1.i = -eigs[i__2].i; + zaxpy_(m, &z__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTR */ +/* BLAS */ + res[i__] = dznrm2_(m, &y[i__ * y_dim1 + 1], &c__1); +/* BLAS */ + } + } + } + + if (*whtsvd == 4) { + rwork[*n + 1] = xscl1; + rwork[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* zgedmd_ */ + diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 new file mode 100644 index 000000000..090641ad8 --- /dev/null +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -0,0 +1,996 @@ + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 + +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!............................................................ +! Purpose +! ======= +! ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, ZGEDMD computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, ZGEDMD returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! 
+!...................................................................... +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!............................................................ +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product X(:,1:K)*W, where X +! contains a POD basis (leading left singular vectors +! of the data matrix X) and W contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of K, X, W, Z. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will be +! computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: ZGESVD (the QR SVD algorithm) +! 
2 :: ZGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M>= 0 +! The state space dimension (the row dimension of X, Y). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshot pairs +! (the number of columns of X and Y). +!..... +! X (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, X contains the data snapshot matrix X. It is +! assumed that the column norms of X are in the range of +! the normalized floating point numbers. +! < On exit, the leading K columns of X contain a POD basis, +! i.e. the leading K left singular vectors of the input +! data matrix X, U(:,1:K). All N columns of X contain all +! left singular vectors of the input matrix X. +! See the descriptions of K, Z and W. +!..... +! LDX (input) INTEGER, LDX >= M +! The leading dimension of the array X. +!..... +! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of EIGS contain +! 
the computed eigenvalues (Ritz values).
+! See the descriptions of K and Z.
+!.....
+! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array
+! If JOBZ == 'V', then Z contains the Ritz vectors. Z(:,i)
+! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
+! is an eigenvector corresponding to EIGS(i). The columns
+! of W(1:K,1:K) are the computed eigenvectors of the
+! K-by-K Rayleigh quotient.
+! See the descriptions of EIGS, X and W.
+!.....
+! LDZ (input) INTEGER, LDZ >= M
+! The leading dimension of the array Z.
+!.....
+! RES (output) REAL(KIND=WP) N-by-1 array
+! RES(1:K) contains the residuals for the K computed
+! Ritz pairs,
+! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2.
+! See the descriptions of EIGS and Z.
+!.....
+! B (output) COMPLEX(KIND=WP) M-by-N array.
+! If JOBF == 'R', B(1:M,1:K) contains A*U(:,1:K), and can
+! be used for computing the refined vectors; see further
+! details in the provided references.
+! If JOBF == 'E', B(1:M,1:K) contains
+! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+! Exact DMD, up to scaling by the inverse eigenvalues.
+! If JOBF == 'N', then B is not referenced.
+! See the descriptions of X, W, K.
+!.....
+! LDB (input) INTEGER, LDB >= M
+! The leading dimension of the array B.
+!.....
+! W (workspace/output) COMPLEX(KIND=WP) N-by-N array
+! On exit, W(1:K,1:K) contains the K computed
+! eigenvectors of the matrix Rayleigh quotient.
+! The Ritz vectors (returned in Z) are the
+! product of X (containing a POD basis for the input
+! matrix X) and W. See the descriptions of K, S, X and Z.
+! W is also used as a workspace to temporarily store the
+! right singular vectors of X.
+!.....
+! LDW (input) INTEGER, LDW >= N
+! The leading dimension of the array W.
+!.....
+! S (workspace/output) COMPLEX(KIND=WP) N-by-N array
+! The array S(1:K,1:K) is used for the matrix Rayleigh
+! quotient. This content is overwritten during
+! the eigenvalue decomposition by ZGEEV.
+! See the description of K.
+!.....
+! LDS (input) INTEGER, LDS >= N
+! The leading dimension of the array S.
+!.....
+! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+! ZWORK is used as complex workspace in the complex SVD, as
+! specified by WHTSVD (1, 2, 3 or 4), and for ZGEEV when
+! computing the eigenvalues of the Rayleigh quotient.
+! If the call to ZGEDMD is only a workspace query, then
+! ZWORK(1) contains the minimal complex workspace length and
+! ZWORK(2) is the optimal complex workspace length.
+! Hence, the length of ZWORK is at least 2.
+! See the description of LZWORK.
+!.....
+! LZWORK (input) INTEGER
+! The minimal length of the workspace vector ZWORK.
+! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV),
+! where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal
+! LZWORK_SVD is calculated as follows:
+! If WHTSVD == 1 :: ZGESVD ::
+! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+! If WHTSVD == 2 :: ZGESDD ::
+! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+! If WHTSVD == 3 :: ZGESVDQ ::
+! LZWORK_SVD = obtainable by a query
+! If WHTSVD == 4 :: ZGEJSV ::
+! LZWORK_SVD = obtainable by a query
+! If on entry LZWORK = -1, then a workspace query is
+! assumed and the procedure only computes the minimal
+! and the optimal workspace lengths and returns them in
+! ZWORK(1) and ZWORK(2), respectively.
+!.....
+! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
+! On exit, RWORK(1:N) contains the singular values of
+! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain the
+! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
+! and Y to avoid overflow in the SVD of X.
+! This may be of interest if the scaling option is off
+! and as many of the smallest eigenvalues as possible are
+! desired to the highest feasible accuracy.
+! If the call to ZGEDMD is only a workspace query, then
+! RWORK(1) contains the minimal workspace length.
+! See the description of LRWORK.
+!.....
+! LRWORK (input) INTEGER
+! The minimal length of the workspace vector RWORK.
+! LRWORK is calculated as follows:
+! LRWORK = MAX(1, N+LRWORK_SVD, N+LRWORK_ZGEEV), where
+! LRWORK_ZGEEV = MAX(1,2*N) and LRWORK_SVD is the real
+! workspace for the SVD subroutine determined by the input
+! parameter WHTSVD.
+! If WHTSVD == 1 :: ZGESVD ::
+! LRWORK_SVD = 5*MIN(M,N)
+! If WHTSVD == 2 :: ZGESDD ::
+! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
+! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) )
+! If WHTSVD == 3 :: ZGESVDQ ::
+! LRWORK_SVD = obtainable by a query
+! If WHTSVD == 4 :: ZGEJSV ::
+! LRWORK_SVD = obtainable by a query
+! If on entry LRWORK = -1, then a workspace query is
+! assumed and the procedure only computes the minimal
+! real workspace length and returns it in RWORK(1).
+!.....
+! IWORK (workspace/output) INTEGER LIWORK-by-1 array
+! Workspace that is required only if WHTSVD equals
+! 2, 3 or 4. (See the description of WHTSVD).
+! If on entry LZWORK = -1, LRWORK = -1 or LIWORK = -1, then
+! the minimal length of IWORK is computed and returned in
+! IWORK(1). See the description of LIWORK.
+!.....
+! LIWORK (input) INTEGER
+! The minimal length of the workspace vector IWORK.
+! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >= 1
+! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+! If on entry LIWORK = -1, then a workspace query is
+! assumed and the procedure only computes the minimal
+! and the optimal workspace lengths for ZWORK, RWORK and
+! IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!.....
+! INFO (output) INTEGER
+! -i < 0 :: On entry, the i-th argument had an
+! illegal value
+! = 0 :: Successful return.
+! = 1 :: Void input. Quick exit (M=0 or N=0).
+! = 2 :: The SVD computation of X did not converge.
+! Suggestion: Check the input data and/or
+! repeat with different WHTSVD.
+! = 3 :: The computation of the eigenvalues did not
+! converge.
+! = 4 :: If data scaling was requested on input and
+! the procedure found inconsistency in the data
+! such that for some column index i,
+! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+! to zero if JOBS=='C'. The computation proceeds
+! with original or modified data and the warning
+! flag is set with INFO=4.
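+!.....
+! A minimal calling sketch (illustrative only; the caller's
+! declarations and workspace allocation are assumed here and
+! are not part of this routine): first call ZGEDMD with
+! LZWORK = LRWORK = LIWORK = -1 to query the workspace sizes,
+! then call it again with the returned lengths, e.g.
+!
+! CALL ZGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY, &
+! -1, TOL, K, EIGS, Z, LDZ, RES, B, LDB, W, LDW, &
+! S, LDS, ZWORK, -1, RWORK, -1, IWORK, -1, INFO )
+! LZWORK = INT(ZWORK(2)) ! optimal complex workspace length
+! LRWORK = INT(RWORK(1)) ! minimal real workspace length
+! LIWORK = IWORK(1) ! minimal integer workspace length
+! ... allocate ZWORK(LZWORK), RWORK(LRWORK), IWORK(LIWORK) ...
+! CALL ZGEDMD( 'S', 'V', 'R', 'N', 1, M, N, X, LDX, Y, LDY, &
+! -1, TOL, K, EIGS, Z, LDZ, RES, B, LDB, W, LDW, &
+! S, LDS, ZWORK, LZWORK, RWORK, LRWORK, IWORK, LIWORK, INFO )
+!.............................................................
+!.............................................................
+! Parameters
+! ~~~~~~~~~~
+ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP
+ REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+ COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP )
+ COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+
+! Local scalars
+! 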
~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, LWRSVJ, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK, MLRWRK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) + +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 + EXTERNAL ZLANGE, DLAMCH, DZNRM2, IZAMAX + INTEGER IZAMAX + LOGICAL DISNAN, LSAME + EXTERNAL DISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZAXPY, ZGEMM, ZDSCAL + EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & + ZLACPY, ZLASCL, ZLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC DBLE, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) & + .OR. ( LRWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -17 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -20 + ELSE IF ( LDW < N ) THEN + INFO = -22 + ELSE IF ( LDS < N ) THEN + INFO = -24 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + RWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + IMINWR = 1 + MLRWRK = MAX(1,N) + MLWORK = 2 + OLWORK = 2 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of ZGESVD: + ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MLWORK = MAX(MLWORK,MWRSVD) + MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N)) + IF ( LQUERY ) THEN + CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, & + B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 ) + LWRSVD = INT( ZWORK(1) ) + OLWORK = MAX(OLWORK,LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of ZGESDD: + ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N). + ! 
RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) + ! In LAPACK 3.10.1 RWORK is defined differently. + ! Below we take max over the two versions. + ! IMINWR = 8*MIN(M,N) + MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) + MLWORK = MAX(MLWORK,MWRSDD) + IMINWR = 8*MIN(M,N) + MLRWRK = MAX( MLRWRK, N + & + MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), & + 5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), & + 2*MAX(M,N)*MIN(M,N)+ & + 2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) + IF ( LQUERY ) THEN + CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B,LDB,& + W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD,INT( ZWORK(1) )) + ! Possible bug in ZGESDD optimal workspace size. + OLWORK = MAX(OLWORK,LWRSDD) + END IF + CASE (3) + CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK, & + IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVQ) + MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVQ) + END IF + CASE (4) + JSVOPT = 'J' + CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, -1, RDUMMY, -1, IWORK, INFO1 ) + IMINWR = IWORK(1) + MWRSVJ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVJ) + MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1)))) + IF ( LQUERY ) THEN + LWRSVJ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the ZGEEV call + MWRKEV = MAX( 1, 2*N ) + MLWORK = MAX(MLWORK,MWRKEV) + MLRWRK = MAX(MLRWRK,N+2*N) + IF ( LQUERY ) THEN + CALL ZGEEV( 'N', JOBZL, N, S, LDS, EIGS, & + W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) + LWRKEV = INT(ZWORK(1)) + OLWORK = MAX( OLWORK, LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30 + IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26 + + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'ZGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + RWORK(1) = MLRWRK + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = DLAMCH('O') + SMALL = DLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using ZLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DZNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), LDX, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! 
X(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + X(1,i), LDX, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( RWORK(i) > ZERO ) THEN + CALL ZDSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL ZLASCL( 'G', 0, 0, -RWORK(i), & + ONE/DBLE(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(Y(IZAMAX(M, Y(1,i),1),i )) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL ZDSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using ZLASSQ. + DO i = 1, N + !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) + SCALE = ZERO + CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('ZGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), LDY, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! Y(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + Y(1,i), LDY, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( RWORK(i) > ZERO ) THEN + CALL ZDSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL ZLASCL( 'G', 0, 0, -RWORK(i), & + ONE/DBLE(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(X(IZAMAX(M, X(1,i),1),i )) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. 
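+! In all four WHTSVD branches below, the computed left singular
+! vectors end up in X and the singular values in RWORK(1:N).
+! T_OR_N records whether W holds the right singular vectors
+! themselves ('N', the ZGEJSV branch) or their adjoint ('C',
+! the ZGESVD/ZGESDD/ZGESVDQ branches); the ZGEMM calls in
+! step <4> below use this flag to select the proper operand form.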
+ NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (2) + CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, & + LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (3) + CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, ZWORK, & + LZWORK, RWORK(N+1), LRWORK-N, INFO1) ! LAPACK CALL + CALL ZLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'C' + CASE (4) + CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL ZLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = RWORK(N+1) + XSCL2 = RWORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case ZGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL ZLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( RWORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( RWORK(i+1) <= RWORK(i)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( RWORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^H * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^H is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL ZDSCAL( N, ONE/RWORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that ZGESVD, ZGESVDQ and ZGESDD return the + ! adjoint matrix of the right singular vectors. + !DO i = 1, K + ! CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + RWORK(N+i) = ONE/RWORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside ZGEDMD). + CALL ZGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, & + LDW, ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! 
INTRINSIC, for T_OR_N=='C'
+ ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N'
+ !
+ ! At this point Z contains
+ ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+ ! this is needed for computing the residuals.
+ ! This matrix is returned in the array B and
+ ! it can be used to compute refined Ritz vectors.
+ CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! LAPACK CALL
+ ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC
+
+ CALL ZGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, &
+ LDZ, ZZERO, S, LDS ) ! BLAS CALL
+ ! S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) ! INTRINSIC
+ ! At this point S = U^H * A * U is the Rayleigh quotient.
+ ELSE
+ ! A * U(:,1:K) is not explicitly needed and the
+ ! computation is organized differently. The Rayleigh
+ ! quotient is computed more efficiently.
+ CALL ZGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, &
+ ZZERO, Z, LDZ ) ! BLAS CALL
+ ! Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) ) ! INTRINSIC
+ !
+ CALL ZGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, &
+ LDW, ZZERO, S, LDS ) ! BLAS CALL
+ ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='C'
+ ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N'
+ ! At this point S = U^H * A * U is the Rayleigh quotient.
+ ! If the residuals are requested, save scaled V_k into Z.
+ ! Recall that V_k or V_k^H is stored in W.
+ IF ( WNTRES .OR. WNTEX ) THEN
+ IF ( LSAME(T_OR_N, 'N') ) THEN
+ CALL ZLACPY( 'A', N, K, W, LDW, Z, LDZ )
+ ELSE
+ CALL ZLACPY( 'A', K, N, W, LDW, Z, LDZ )
+ END IF
+ END IF
+ END IF
+!
+ !<5> Compute the Ritz values and (if requested) the
+ ! right eigenvectors of the Rayleigh quotient.
+ !
+ CALL ZGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, LDW, &
+ W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL
+ !
+ ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+ ! quotient. See the description of Z.
+ ! Also, see the description of ZGEEV.
+ IF ( INFO1 > 0 ) THEN
+ ! ZGEEV failed to compute the eigenvalues and
+ ! eigenvectors of the Rayleigh quotient.
+ INFO = 3
+ RETURN
+ END IF
+!
+ ! <6> Compute the eigenvectors (if requested) and
+ ! the residuals (if requested).
+ !
+ IF ( WNTVEC .OR. WNTEX ) THEN
+ IF ( WNTRES ) THEN
+ IF ( WNTREF ) THEN
+ ! Here, if the refinement is requested, we have
+ ! A*U(:,1:K) already computed and stored in Z.
+ ! For the residuals, need Y = A * U(:,1:K) * W.
+ CALL ZGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, &
+ LDW, ZZERO, Y, LDY ) ! BLAS CALL
+ ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC
+ ! This frees Z; Y contains A * U(:,1:K) * W.
+ ELSE
+ ! Compute S = V_k * Sigma_k^(-1) * W, where
+ ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z
+ CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+ W, LDW, ZZERO, S, LDS )
+ ! Then, compute Z = Y * S =
+ ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+ ! = A * U(:,1:K) * W(1:K,1:K)
+ CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+ LDS, ZZERO, Z, LDZ )
+ ! Save a copy of Z into Y and free Z for holding
+ ! the Ritz vectors.
+ CALL ZLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+ IF ( WNTEX ) CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )
+ END IF
+ ELSE IF ( WNTEX ) THEN
+ ! Compute S = V_k * Sigma_k^(-1) * W, where
+ ! V_k * Sigma_k^(-1) is stored in Z
+ CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+ W, LDW, ZZERO, S, LDS )
+ ! Then, compute Z = Y * S =
+ ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+ ! = A * U(:,1:K) * W(1:K,1:K)
+ CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+ LDS, ZZERO, B, LDB )
+ ! The above call replaces the following two calls
+ ! that were used in the development and testing phase.
+ ! CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+ ! LDS, ZZERO, Z, LDZ)
+ ! Save a copy of Z into B and free Z for holding
+ ! the Ritz vectors.
+ ! CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )
+ END IF
+!
+ ! Compute the Ritz vectors
+ IF ( WNTVEC ) CALL ZGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, &
+ ZZERO, Z, LDZ ) ! BLAS CALL
+ ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC
+!
+ IF ( WNTRES ) THEN
+ DO i = 1, K
+ CALL ZAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL
+ ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTRINSIC
+ RES(i) = DZNRM2( M, Y(1,i), 1 ) ! BLAS CALL
+ END DO
+ END IF
+ END IF
+!
+ IF ( WHTSVD == 4 ) THEN
+ RWORK(N+1) = XSCL1
+ RWORK(N+2) = XSCL2
+ END IF
+!
+! Successful exit.
+ IF ( .NOT. BADXY ) THEN
+ INFO = 0
+ ELSE
+ ! A warning on possible data inconsistency.
+ ! This should be a rare event.
+ INFO = 4
+ END IF
+!............................................................
+ RETURN
+! ......
+ END SUBROUTINE ZGEDMD
+
diff --git a/lapack-netlib/SRC/zgedmdq.c b/lapack-netlib/SRC/zgedmdq.c
new file mode 100644
index 000000000..1815f0814
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmdq.c
@@ -0,0 +1,1293 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{ flag cierr;
+ ftnint ciunit;
+ flag ciend;
+ char *cifmt;
+ ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{ flag icierr;
+ char *iciunit;
+ flag iciend;
+ char *icifmt;
+ ftnint icirlen;
+ ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{ flag oerr;
+ ftnint ounit;
+ char *ofnm;
+ ftnlen ofnmlen;
+ char *osta;
+ char *oacc;
+ char *ofm;
+ ftnint orl;
+ char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{ flag cerr;
+ ftnint 
cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by ZGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */ +/* See the description of ZWORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K) of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... 
*/
+/* LDY (input) INTEGER, LDY >= N */
+/* The leading dimension of the array Y. */
+/* ..... */
+/* NRNK (input) INTEGER */
+/* Determines the mode for computing the numerical rank, */
+/* i.e. how to truncate small singular values of the input */
+/* matrix X. On input, if */
+/* NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/* if sigma(i) <= TOL*sigma(1) */
+/* This option is recommended. */
+/* NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/* if sigma(i) <= TOL*sigma(i-1) */
+/* This option is included for R&D purposes. */
+/* It requires highly accurate SVD, which */
+/* may not be feasible. */
+/* The numerical rank can be enforced by using a positive */
+/* value of NRNK as follows: */
+/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/* will be used. If the number of the computed nonzero */
+/* singular values is less than NRNK, then only those */
+/* nonzero values will be used and the actually used */
+/* dimension is less than NRNK. The actual number of */
+/* the nonzero singular values is returned in the variable */
+/* K. See the description of K. */
+/* ..... */
+/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/* The tolerance for truncating small singular values. */
+/* See the description of NRNK. */
+/* ..... */
+/* K (output) INTEGER, 0 <= K <= N */
+/* The dimension of the SVD/POD basis for the leading N-1 */
+/* data snapshots (columns of F) and the number of the */
+/* computed Ritz pairs. The value of K is determined */
+/* according to the rule set by the parameters NRNK and */
+/* TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/* EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */
+/* The leading K (K<=N-1) entries of EIGS contain */
+/* the computed eigenvalues (Ritz values). */
+/* See the descriptions of K and Z. */
+/* ..... */
+/* Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array */
+/* If JOBZ == 'V', then Z contains the Ritz vectors. Z(:,i) */
+/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/* Z*V, where Z contains an orthonormal matrix (the product */
+/* of Q from the initial QR factorization and the SVD/POD */
+/* basis returned by ZGEDMD in X) and the second factor (the */
+/* eigenvectors of the Rayleigh quotient) is in the array V, */
+/* as returned by ZGEDMD. That is, X(:,1:K)*V(:,i) */
+/* is an eigenvector corresponding to EIGS(i). The columns */
+/* of V(1:K,1:K) are the computed eigenvectors of the */
+/* K-by-K Rayleigh quotient. */
+/* See the descriptions of EIGS, X and V. */
+/* ..... */
+/* LDZ (input) INTEGER, LDZ >= M */
+/* The leading dimension of the array Z. */
+/* ..... */
+/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/* RES(1:K) contains the residuals for the K computed */
+/* Ritz pairs, */
+/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2. */
+/* See the descriptions of EIGS and Z. */
+/* ..... */
+/* B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. */
+/* If JOBF == 'R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/* be used for computing the refined vectors; see further */
+/* details in the provided references. */
+/* If JOBF == 'E', B(1:N,1:K) contains */
+/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/* Exact DMD, up to scaling by the inverse eigenvalues. */
+/* In both cases, the content of B can be lifted to the */
+/* original dimension of the input data by pre-multiplying */
+/* with the Q factor from the initial QR factorization. */
+/* Here A denotes a compression of the underlying operator. */
+/* See the descriptions of F and X. */
+/* If JOBF == 'N', then B is not referenced. */
+/* ..... */
+/* LDB (input) INTEGER, LDB >= MIN(M,N) */
+/* The leading dimension of the array B. */
+/* ..... */
+/* V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/* On exit, V(1:K,1:K) contains the K computed eigenvectors */
+/* of the Rayleigh quotient. The Ritz vectors */
+/* (returned in Z) are the product of Q from the initial QR */
+/* factorization (see the description of F), X (see the */
+/* description of X) and V. */
+/* ..... */
+/* LDV (input) INTEGER, LDV >= N-1 */
+/* The leading dimension of the array V. */
+/* ..... */
+/* S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/* The array S(1:K,1:K) is used for the matrix Rayleigh */
+/* quotient. This content is overwritten during */
+/* the eigenvalue decomposition by ZGEEV. */
+/* See the description of K. */
+/* ..... */
+/* LDS (input) INTEGER, LDS >= N-1 */
+/* The leading dimension of the array S. */
+/* ..... */
+/* ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */
+/* On exit, */
+/* ZWORK(1:MIN(M,N)) contains the scalar factors of the */
+/* elementary reflectors as returned by ZGEQRF of the */
+/* M-by-N input matrix F. */
+/* If the call to ZGEDMDQ is only a workspace query, then */
+/* ZWORK(1) contains the minimal complex workspace length and */
+/* ZWORK(2) is the optimal complex workspace length. */
+/* Hence, the length of ZWORK is at least 2. */
+/* See the description of LZWORK. */
+/* ..... */
+/* LZWORK (input) INTEGER */
+/* The minimal length of the workspace vector ZWORK. */
+/* LZWORK is calculated as follows: */
+/* Let MLWQR = N (minimal workspace for ZGEQRF[M,N]) */
+/* MLWDMD = minimal workspace for ZGEDMD (see the */
+/* description of LWORK in ZGEDMD) */
+/* MLWMQR = N (minimal workspace for */
+/* ZUNMQR['L','N',M,N,N]) */
+/* MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */
+/* MINMN = MIN(M,N) */
+/* Then */
+/* LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */
+/* is further updated as follows: */
+/* if JOBZ == 'V' or JOBZ == 'F' THEN */
+/* LZWORK = MAX(LZWORK, MINMN+MLWMQR) */
+/* if JOBQ == 'Q' THEN */
+/* LZWORK = MAX(LZWORK, MINMN+MLWGQR) */
+
+/* ..... */
+/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/* On exit, */
+/* WORK(1:N-1) contains the singular values of */
+/* the input submatrix F(1:M,1:N-1). */
+/* If the call to ZGEDMDQ is only a workspace query, then */
+/* WORK(1) contains the minimal workspace length and */
+/* WORK(2) is the optimal workspace length. Hence, the */
+/* length of WORK is at least 2. */
+/* See the description of LWORK. */
+/* ..... */
+/* LWORK (input) INTEGER */
+/* The minimal length of the workspace vector WORK. */
+/* LWORK is the same as in ZGEDMD, because in ZGEDMDQ */
+/* only ZGEDMD requires real workspace for snapshots */
+/* of dimensions MIN(M,N)-by-(N-1). */
+/* If on entry LWORK = -1, then a workspace query is */
+/* assumed and the procedure only computes the minimal */
+/* and the optimal workspace lengths for WORK. */
+/* ..... */
+/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/* Workspace that is required only if WHTSVD equals */
+/* 2, 3 or 4. (See the description of WHTSVD). */
+/* If on entry LZWORK = -1, LWORK = -1 or LIWORK = -1, then */
+/* the minimal length of IWORK is computed and returned in */
+/* IWORK(1). See the description of LIWORK. */
+/* ..... */
+/* LIWORK (input) INTEGER */
+/* The minimal length of the workspace vector IWORK. 
*/ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + zzero.r = 0.f, zzero.i = 0.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lzwork == -1 || *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! 
(*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -21; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -24; + } else if (*ldv < *n - 1) { + *info = -26; + } else if (*lds < *n - 1) { + *info = -28; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + zwork[1].r = 2., zwork[1].i = 0.; + zwork[2].r = 2., zwork[2].i = 0.; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlrwrk = 2; + mlwork = 2; + olwork = 2; + iminwr = 1; + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for ZGEQRF. */ +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, & + info1); + olwqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwqr; + olwork = f2cmax(i__1,i__2); + } + i__1 = *n - 1; + zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset] + , ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &zwork[1], &c_n1, &work[1], &c_n1, &iwork[1], + &c_n1, &info1); + mlwdmd = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = (integer) work[1]; + mlrwrk = f2cmax(i__1,i__2); + iminwr = f2cmax(iminwr,iwork[1]); + if (lquery) { + olwdmd = (integer) zwork[2].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], + &z__[z_offset], ldz, &zwork[1], &c_n1, &info1); + olwmqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], & + zwork[1], &c_n1, &info1); + olwgqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + if (*liwork < iminwr && ! lquery) { + *info = -34; + } + if (*lwork < mlrwrk && ! lquery) { + *info = -32; + } + if (*lzwork < mlwork && ! 
lquery) { + *info = -30; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.; + zwork[2].r = (doublereal) olwork, zwork[2].i = 0.; + work[1] = (doublereal) mlrwrk; + work[2] = (doublereal) mlrwrk; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >> N, at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lzwork - minmn; + zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + zlaset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx); + i__1 = *n - 1; + zlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + zlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + zlaset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lzwork - minmn; + zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, & + res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, & + zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See ZGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + zlaset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lzwork - minmn; + zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by ZGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by ZGEDMD. */ + zlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + zlaset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lzwork - minmn; + zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to ZGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. 
*/ + if (wnttrf) { +/* Return the upper triangular R in Y */ + zlaset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy); + zlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/unitary factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lzwork - minmn; + zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn + + 1], &i__1, &info1); + } + + return 0; + +} /* zgedmdq_ */ + diff --git a/lapack-netlib/SRC/zgedmdq.f90 b/lapack-netlib/SRC/zgedmdq.f90 new file mode 100644 index 000000000..51be72a32 --- /dev/null +++ b/lapack-netlib/SRC/zgedmdq.f90 @@ -0,0 +1,689 @@ +SUBROUTINE ZGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, EIGS, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, LWORK, & + IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LZWORK, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*) + COMPLEX(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, ZGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, ZGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! 
Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! unitary matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! 
Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: ZGESVD (the QR SVD algorithm) +! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: ZGESVDQ (the preconditioned QR SVD; this and 4 +! are the most accurate options) +! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! option 4 is slightly better and has a stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by ZGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). +! See the description of ZWORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K) of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-N array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +!
If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER, LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using a positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N-1) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array +! If JOBZ == 'V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! Z*V, where Z contains an orthonormal matrix (the product of +! Q from the initial QR factorization and the SVD/POD_basis +! returned by ZGEDMD in X) and the second factor (the +! eigenvectors of the Rayleigh quotient) is in the array V, +! as returned by ZGEDMD. That is, X(:,1:K)*V(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of V(1:K,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and V. +!..... +! LDZ (input) INTEGER, LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2. +! See the descriptions of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. +! If JOBF == 'R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1:K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF == 'N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +!
The leading dimension of the array B. +!..... +! V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) contains the K eigenvectors of +! the Rayleigh quotient. The Ritz vectors +! (returned in Z) are the product of Q from the initial QR +! factorization (see the description of F), X (see the +! description of X) and V. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by ZGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +! On exit, +! ZWORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by ZGEQRF of the +! M-by-N input matrix F. +! If the call to ZGEDMDQ is only a workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of ZWORK is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for ZGEQRF[M,N]) +! MLWDMD = minimal workspace for ZGEDMD (see the +! description of LWORK in ZGEDMD) +! MLWMQR = N (minimal workspace for +! ZUNMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) +! MINMN = MIN(M,N) +! Then +! LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) +! is further updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LZWORK = MAX(LZWORK, MINMN+MLWMQR) +! if JOBQ == 'Q' THEN +! LZWORK = MAX(LZWORK, MINMN+MLWGQR) +! +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to ZGEDMDQ is only a workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of WORK is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is the same as in ZGEDMD, because in ZGEDMDQ +! only ZGEDMD requires real workspace for snapshots +! of dimensions MIN(M,N)-by-(N-1). +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace length for WORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2, 3 or 4. (See the description of WHTSVD). +! If on entry LWORK = -1 or LIWORK = -1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >= 1. +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +!
-i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MINMN, MLRWRK, & + MLWDMD, MLWGQR, MLWMQR, MLWORK, & + MLWQR, OLWDMD, OLWGQR, OLWMQR, & + OLWORK, OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZGEQRF, ZLACPY, ZLASET, ZUNGQR, & + ZUNMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( (LZWORK == -1) .OR. (LWORK == -1) .OR. (LIWORK == -1) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -21 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -24 + ELSE IF ( LDV < N-1 ) THEN + INFO = -26 + ELSE IF ( LDS < N-1 ) THEN + INFO = -28 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! 
requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + MLRWRK = 2 + MLWORK = 2 + OLWORK = 2 + IMINWR = 1 + MLWQR = MAX(1,N) ! Minimal workspace length for ZGEQRF. + MLWORK = MAX(MLWORK,MINMN + MLWQR) + + IF ( LQUERY ) THEN + CALL ZGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, & + INFO1 ) + OLWQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN + OLWQR) + END IF + CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, -1, WORK, -1, IWORK,& + -1, INFO1 ) + MLWDMD = INT(ZWORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + MLRWRK = MAX(MLRWRK, INT(WORK(1))) + IMINWR = MAX(IMINWR, IWORK(1)) + IF ( LQUERY ) THEN + OLWDMD = INT(ZWORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+MLWMQR) + IF ( LQUERY ) THEN + CALL ZUNMQR( 'L','N', M, N, MINMN, F, LDF, & + ZWORK, Z, LDZ, ZWORK, -1, INFO1 ) + OLWMQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+MLWGQR) + IF ( LQUERY ) THEN + CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK, -1, INFO1 ) + OLWGQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN+OLWGQR) + END IF + END IF + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34 + IF ( LWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'ZGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + WORK(1) = MLRWRK + WORK(2) = MLRWRK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >> N, at this place +! one can use an out of core QRF. +! + CALL ZGEQRF( M, N, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL ZLASET( 'L', MINMN, N-1, ZZERO, ZZERO, X, LDX ) + CALL ZLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL ZLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL ZLASET( 'L', MINMN-2, N-2, ZZERO, ZZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & + WORK, LWORK, IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See ZGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. 
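+          ! ZGEDMD returned the Ritz vectors in the coordinates of the
+          ! MIN(M,N)-dimensional QR basis, i.e. in the leading MIN(M,N)
+          ! rows of Z. Zero out the remaining rows of Z and then apply
+          ! the unitary factor Q (held implicitly as Householder
+          ! reflectors in F and ZWORK(1:MIN(M,N))) via ZUNMQR, so that
+          ! Z := Q*Z is formed without explicitly assembling Q.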
+ IF ( M > MINMN ) CALL ZLASET( 'A', M-MINMN, K, ZZERO, & + ZZERO, Z(MINMN+1,1), LDZ ) + CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by ZGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by ZGEDMD. + CALL ZLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL ZLASET( 'A', M-N, K, ZZERO, ZZERO, & + Z(N+1,1), LDZ ) + CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to ZGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL ZLASET( 'A', MINMN, N, ZZERO, ZZERO, Y, LDY ) + CALL ZLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/unitary factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE ZGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 5134ea891..b4bc531ab 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -52,10 +52,10 @@ *> are computed and stored in the arrays U and V, respectively. The diagonal *> of [SIGMA] is computed and stored in the array SVA. *> \endverbatim -*> -*> Arguments: -*> ========== -*> +* +* Arguments: +* ========== +* *> \param[in] JOBA *> \verbatim *> JOBA is CHARACTER*1 @@ -151,7 +151,7 @@ *> transposed A if A^* seems to be better with respect to convergence. *> If the matrix is not square, JOBT is ignored. *> The decision is based on two values of entropy over the adjoint -*> orbit of A^* * A. See the descriptions of WORK(6) and WORK(7). +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). *> = 'T': transpose if entropy test indicates possibly faster *> convergence of Jacobi process if A^* is taken as input. If A is *> replaced with A^*, then the row pivoting is included automatically. @@ -209,11 +209,11 @@ *> \verbatim *> SVA is DOUBLE PRECISION array, dimension (N) *> On exit, -*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the -*> computation SVA contains Euclidean column norms of the +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the *> iterated matrices in the array A. -*> - For WORK(1) .NE. WORK(2): The singular values of A are -*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if *> sigma_max(A) overflows or if small singular values have been *> saved from underflow by scaling the input matrix A. 
*> - If JOBR='R' then some of the singular values may be returned @@ -252,7 +252,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then *> copied back to the U array. This 'W' option is just @@ -1821,7 +1821,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1844,7 +1844,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1888,7 +1888,7 @@ ELSE IF ( CONDR2 .LT. COND_OK ) THEN * * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* is Q3^* * V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/zgelqt3.f b/lapack-netlib/SRC/zgelqt3.f index 629a09472..1a71dc44e 100644 --- a/lapack-netlib/SRC/zgelqt3.f +++ b/lapack-netlib/SRC/zgelqt3.f @@ -174,7 +174,8 @@ * * Compute Householder transform when M=1 * - CALL ZLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL ZLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/SRC/zgelsd.f b/lapack-netlib/SRC/zgelsd.f index 01793e16c..b5bc768e8 100644 --- a/lapack-netlib/SRC/zgelsd.f +++ b/lapack-netlib/SRC/zgelsd.f @@ -60,12 +60,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f index be53ba95b..afdbaecf0 100644 --- a/lapack-netlib/SRC/zgelss.f +++ b/lapack-netlib/SRC/zgelss.f @@ -170,7 +170,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -212,10 +212,9 @@ COMPLEX*16 DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, - $ ZDRSCL, ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, - $ ZLACPY, ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ, - $ ZUNMQR + EXTERNAL DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, ZDRSCL, + $ ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, ZLACPY, + $ ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ * .. * .. External Functions .. 
INTEGER ILAENV @@ -388,7 +387,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -540,7 +538,7 @@ $ LDB, CZERO, WORK, N ) CALL ZLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL ZCOPY( N, WORK, 1, B, 1 ) END IF @@ -645,7 +643,7 @@ CALL ZLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, CZERO, WORK( IWORK ), 1 ) CALL ZCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -737,7 +735,7 @@ $ LDB, CZERO, WORK, N ) CALL ZLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL ZCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/zgelsy.f b/lapack-netlib/SRC/zgelsy.f index 65fa87ae9..497becf8b 100644 --- a/lapack-netlib/SRC/zgelsy.f +++ b/lapack-netlib/SRC/zgelsy.f @@ -116,6 +116,7 @@ *> B is COMPLEX*16 array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -148,6 +149,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index 30d18a3a0..7f203afa5 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -53,12 +53,6 @@ *> *> Note that the routine returns VT = V**H, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zgesvdq.f b/lapack-netlib/SRC/zgesvdq.f index 05eb722f8..b990f7389 100644 --- a/lapack-netlib/SRC/zgesvdq.f +++ b/lapack-netlib/SRC/zgesvdq.f @@ -363,7 +363,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/zgetf2.f b/lapack-netlib/SRC/zgetf2.f index c247f8645..7c63dbbee 100644 --- a/lapack-netlib/SRC/zgetf2.f +++ b/lapack-netlib/SRC/zgetf2.f @@ -101,7 +101,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getf2 * * ===================================================================== SUBROUTINE ZGETF2( M, N, A, LDA, IPIV, INFO ) @@ -127,7 +127,7 @@ * .. * .. Local Scalars .. DOUBLE PRECISION SFMIN - INTEGER I, J, JP + INTEGER J, JP * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -135,7 +135,7 @@ EXTERNAL DLAMCH, IZAMAX * .. * .. External Subroutines .. 
- EXTERNAL XERBLA, ZGERU, ZSCAL, ZSWAP + EXTERNAL XERBLA, ZGERU, ZRSCL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -181,15 +181,8 @@ * * Compute elements J+1:M of J-th column. * - IF( J.LT.M ) THEN - IF( ABS(A( J, J )) .GE. SFMIN ) THEN - CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) - ELSE - DO 20 I = 1, M-J - A( J+I, J ) = A( J+I, J ) / A( J, J ) - 20 CONTINUE - END IF - END IF + IF( J.LT.M ) + $ CALL ZRSCL( M-J, A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f index 672ab7590..075c243c2 100644 --- a/lapack-netlib/SRC/zgsvj0.f +++ b/lapack-netlib/SRC/zgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f index 2461a9555..ba71e155f 100644 --- a/lapack-netlib/SRC/zgsvj1.f +++ b/lapack-netlib/SRC/zgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. 
*> \endverbatim diff --git a/lapack-netlib/SRC/zhbevd.f b/lapack-netlib/SRC/zhbevd.f index 0db551540..be9f01556 100644 --- a/lapack-netlib/SRC/zhbevd.f +++ b/lapack-netlib/SRC/zhbevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian band matrix A. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhbevd_2stage.f b/lapack-netlib/SRC/zhbevd_2stage.f index 4522d5e79..e32c7125c 100644 --- a/lapack-netlib/SRC/zhbevd_2stage.f +++ b/lapack-netlib/SRC/zhbevd_2stage.f @@ -47,12 +47,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhbgvd.f b/lapack-netlib/SRC/zhbgvd.f index b0664750e..4bd02168d 100644 --- a/lapack-netlib/SRC/zhbgvd.f +++ b/lapack-netlib/SRC/zhbgvd.f @@ -46,12 +46,6 @@ *> and banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index 7f58c7f72..ba52f9e72 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -41,12 +41,6 @@ *> complex Hermitian matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zheevd_2stage.f b/lapack-netlib/SRC/zheevd_2stage.f index 9859b0d67..e697a9823 100644 --- a/lapack-netlib/SRC/zheevd_2stage.f +++ b/lapack-netlib/SRC/zheevd_2stage.f @@ -46,12 +46,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. 
It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhegv.f b/lapack-netlib/SRC/zhegv.f index 41657e3be..c973bd0fc 100644 --- a/lapack-netlib/SRC/zhegv.f +++ b/lapack-netlib/SRC/zhegv.f @@ -160,7 +160,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegv_2stage.f b/lapack-netlib/SRC/zhegv_2stage.f index fda651e5e..91ac09311 100644 --- a/lapack-netlib/SRC/zhegv_2stage.f +++ b/lapack-netlib/SRC/zhegv_2stage.f @@ -179,7 +179,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index eeda656ad..c9ff55e3d 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -43,12 +43,6 @@ *> B are assumed to be Hermitian and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -212,7 +206,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegvx.f b/lapack-netlib/SRC/zhegvx.f index ac9f9ef1a..71ed1c4ca 100644 --- a/lapack-netlib/SRC/zhegvx.f +++ b/lapack-netlib/SRC/zhegvx.f @@ -280,7 +280,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index ee1596855..79c01c546 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -88,7 +88,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. 
*> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zhetf2_rk.f b/lapack-netlib/SRC/zhetf2_rk.f index 050c7993d..87df901aa 100644 --- a/lapack-netlib/SRC/zhetf2_rk.f +++ b/lapack-netlib/SRC/zhetf2_rk.f @@ -480,7 +480,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -508,7 +508,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) @@ -834,7 +834,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -862,7 +862,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/zhetf2_rook.f b/lapack-netlib/SRC/zhetf2_rook.f index 94bb29736..91172f601 100644 --- a/lapack-netlib/SRC/zhetf2_rook.f +++ b/lapack-netlib/SRC/zhetf2_rook.f @@ -420,7 +420,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -441,7 +441,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) @@ -733,7 +733,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -754,7 +754,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index e839271a4..1d39ac942 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -538,7 +538,7 @@ C END IF $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT 
+ 1 diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f index 56722e7e6..537c16e8c 100644 --- a/lapack-netlib/SRC/zhetrf_aa.f +++ b/lapack-netlib/SRC/zhetrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 92f1c09b3..477602b5e 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zhpevd.f b/lapack-netlib/SRC/zhpevd.f index 7625c8fe8..5260aaf14 100644 --- a/lapack-netlib/SRC/zhpevd.f +++ b/lapack-netlib/SRC/zhpevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhpgv.f b/lapack-netlib/SRC/zhpgv.f index 72876b6e4..b92168555 100644 --- a/lapack-netlib/SRC/zhpgv.f +++ b/lapack-netlib/SRC/zhpgv.f @@ -144,7 +144,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index e96e39738..dfe92067c 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -205,7 +199,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. 
*> \endverbatim diff --git a/lapack-netlib/SRC/zhpgvx.f b/lapack-netlib/SRC/zhpgvx.f index 94d7f7733..de75b486b 100644 --- a/lapack-netlib/SRC/zhpgvx.f +++ b/lapack-netlib/SRC/zhpgvx.f @@ -250,7 +250,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.f b/lapack-netlib/SRC/zla_gbrfsx_extended.f index fe4d635b1..2f57b7682 100644 --- a/lapack-netlib/SRC/zla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/zla_gbrfsx_extended.f @@ -651,7 +651,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.f b/lapack-netlib/SRC/zla_gerfsx_extended.f index 9d618f294..22e45c5a9 100644 --- a/lapack-netlib/SRC/zla_gerfsx_extended.f +++ b/lapack-netlib/SRC/zla_gerfsx_extended.f @@ -636,7 +636,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_herfsx_extended.f b/lapack-netlib/SRC/zla_herfsx_extended.f index a55dd9431..689460a02 100644 --- a/lapack-netlib/SRC/zla_herfsx_extended.f +++ b/lapack-netlib/SRC/zla_herfsx_extended.f @@ -655,7 +655,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_porfsx_extended.f b/lapack-netlib/SRC/zla_porfsx_extended.f index 12e05e049..e853494fc 100644 --- a/lapack-netlib/SRC/zla_porfsx_extended.f +++ b/lapack-netlib/SRC/zla_porfsx_extended.f @@ -626,7 +626,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_porpvgrw.f b/lapack-netlib/SRC/zla_porpvgrw.f index 9b381a072..897589aa0 100644 --- a/lapack-netlib/SRC/zla_porpvgrw.f +++ b/lapack-netlib/SRC/zla_porpvgrw.f @@ -142,9 +142,9 @@ * .. Executable Statements .. UPPER = LSAME( 'Upper', UPLO ) * -* DPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* DPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0D+0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.f b/lapack-netlib/SRC/zla_syrfsx_extended.f index d6c241499..fb1b9e2d7 100644 --- a/lapack-netlib/SRC/zla_syrfsx_extended.f +++ b/lapack-netlib/SRC/zla_syrfsx_extended.f @@ -655,7 +655,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zlaed7.f b/lapack-netlib/SRC/zlaed7.f index 83f32d8b8..86e5ec6b5 100644 --- a/lapack-netlib/SRC/zlaed7.f +++ b/lapack-netlib/SRC/zlaed7.f @@ -363,7 +363,7 @@ RETURN END IF * -* Prepare the INDXQ sorting premutation. 
+* Prepare the INDXQ sorting permutation. * N1 = K N2 = N - K diff --git a/lapack-netlib/SRC/zlaed8.f b/lapack-netlib/SRC/zlaed8.f index 995a673de..003725820 100644 --- a/lapack-netlib/SRC/zlaed8.f +++ b/lapack-netlib/SRC/zlaed8.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, +* SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, * Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, * GIVCOL, GIVNUM, INFO ) * @@ -29,7 +29,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), * $ Z( * ) * COMPLEX*16 Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -122,9 +122,9 @@ *> destroyed during the updating process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> Contains a copy of the first K eigenvalues which will be used *> by DLAED3 to form the secular equation. *> \endverbatim @@ -222,7 +222,7 @@ *> \ingroup complex16OTHERcomputational * * ===================================================================== - SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, + SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, $ Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, $ GIVCOL, GIVNUM, INFO ) * @@ -237,7 +237,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), $ Z( * ) COMPLEX*16 Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -322,14 +322,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -438,7 +438,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -450,19 +450,19 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 100 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * DO 110 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL ZCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 110 CONTINUE @@ -471,7 +471,7 @@ * into the last N - K slots of D and Q respectively. 
* IF( K.LT.N ) THEN - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL ZLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ), $ LDQ ) END IF diff --git a/lapack-netlib/SRC/zlals0.f b/lapack-netlib/SRC/zlals0.f index 7a7310042..79c0cf5e4 100644 --- a/lapack-netlib/SRC/zlals0.f +++ b/lapack-netlib/SRC/zlals0.f @@ -392,6 +392,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = POLES( I, 2 )*Z( I ) / $ ( DLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -470,6 +475,11 @@ IF( Z( J ).EQ.ZERO ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/zlalsa.f b/lapack-netlib/SRC/zlalsa.f index 73ccf3a8b..d419598d2 100644 --- a/lapack-netlib/SRC/zlalsa.f +++ b/lapack-netlib/SRC/zlalsa.f @@ -42,9 +42,9 @@ *> *> \verbatim *> -*> ZLALSA is an itermediate step in solving the least squares problem +*> ZLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/zlalsd.f b/lapack-netlib/SRC/zlalsd.f index dca308e56..1d7358aa9 100644 --- a/lapack-netlib/SRC/zlalsd.f +++ b/lapack-netlib/SRC/zlalsd.f @@ -48,12 +48,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 3e20200ed..dcb28850a 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -89,7 +89,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -312,7 +312,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, + EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, $ ZLARTG, ZROT DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME @@ -464,7 +464,6 @@ * Get machine constants SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) @@ -535,7 +534,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. 
BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f index 4a3c4dcf1..17d4b33aa 100644 --- a/lapack-netlib/SRC/zlarfb_gett.f +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -452,7 +452,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index e4e703343..be4c48539 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB*M. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/zlatdf.f b/lapack-netlib/SRC/zlatdf.f index 25e71edce..41714a3e0 100644 --- a/lapack-netlib/SRC/zlatdf.f +++ b/lapack-netlib/SRC/zlatdf.f @@ -227,7 +227,7 @@ BM = RHS( J ) - CONE SPLUS = ONE * -* Lockahead for L- part RHS(1:N-1) = +-1 +* Look-ahead for L- part RHS(1:N-1) = +-1 * SPLUS and SMIN computed more efficiently than in BSOLVE[1]. * SPLUS = SPLUS + DBLE( ZDOTC( N-J, Z( J+1, J ), 1, Z( J+1, diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index fc1be0517..231a17274 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -577,7 +577,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2 - K1 diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index ffdbc68c0..8c938aebc 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/zpbsv.f b/lapack-netlib/SRC/zpbsv.f index fe6baf8b5..ef212bbc3 100644 --- a/lapack-netlib/SRC/zpbsv.f +++ b/lapack-netlib/SRC/zpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zpbsvx.f b/lapack-netlib/SRC/zpbsvx.f index 1efd2fd1e..724102376 100644 --- a/lapack-netlib/SRC/zpbsvx.f +++ b/lapack-netlib/SRC/zpbsvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. 
If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -280,10 +280,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpbtf2.f b/lapack-netlib/SRC/zpbtf2.f index 7b90af47e..39bf6c3fa 100644 --- a/lapack-netlib/SRC/zpbtf2.f +++ b/lapack-netlib/SRC/zpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpbtrf.f b/lapack-netlib/SRC/zpbtrf.f index 61cd1d434..80afb0836 100644 --- a/lapack-netlib/SRC/zpbtrf.f +++ b/lapack-netlib/SRC/zpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpftrf.f b/lapack-netlib/SRC/zpftrf.f index 6d18f2cb3..4d4a5116e 100644 --- a/lapack-netlib/SRC/zpftrf.f +++ b/lapack-netlib/SRC/zpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> *> Further Notes on RFP Format: diff --git a/lapack-netlib/SRC/zposv.f b/lapack-netlib/SRC/zposv.f index 3bb625876..0e91cde2f 100644 --- a/lapack-netlib/SRC/zposv.f +++ b/lapack-netlib/SRC/zposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zposvx.f b/lapack-netlib/SRC/zposvx.f index f9e9b1d5f..6c06dbd57 100644 --- a/lapack-netlib/SRC/zposvx.f +++ b/lapack-netlib/SRC/zposvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. 
*> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -276,10 +276,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zposvxx.f b/lapack-netlib/SRC/zposvxx.f index 6735fab71..3886c66f0 100644 --- a/lapack-netlib/SRC/zposvxx.f +++ b/lapack-netlib/SRC/zposvxx.f @@ -87,7 +87,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/zpotf2.f b/lapack-netlib/SRC/zpotf2.f index eb88d617c..b48f3d654 100644 --- a/lapack-netlib/SRC/zpotf2.f +++ b/lapack-netlib/SRC/zpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpotrf.f b/lapack-netlib/SRC/zpotrf.f index 3b7018276..3edae84fb 100644 --- a/lapack-netlib/SRC/zpotrf.f +++ b/lapack-netlib/SRC/zpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpotrf2.f b/lapack-netlib/SRC/zpotrf2.f index 859ddc75f..67430f231 100644 --- a/lapack-netlib/SRC/zpotrf2.f +++ b/lapack-netlib/SRC/zpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. 
*> \endverbatim * diff --git a/lapack-netlib/SRC/zppsv.f b/lapack-netlib/SRC/zppsv.f index 19536e204..f466266e4 100644 --- a/lapack-netlib/SRC/zppsv.f +++ b/lapack-netlib/SRC/zppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zppsvx.f b/lapack-netlib/SRC/zppsvx.f index f94badf78..60d07cbc7 100644 --- a/lapack-netlib/SRC/zppsvx.f +++ b/lapack-netlib/SRC/zppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix, L is a lower triangular *> matrix, and **H indicates conjugate transpose. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpptrf.f b/lapack-netlib/SRC/zpptrf.f index a34d63913..a103e5eca 100644 --- a/lapack-netlib/SRC/zpptrf.f +++ b/lapack-netlib/SRC/zpptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpteqr.f b/lapack-netlib/SRC/zpteqr.f index a81a6ad94..897136c76 100644 --- a/lapack-netlib/SRC/zpteqr.f +++ b/lapack-netlib/SRC/zpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. 
diff --git a/lapack-netlib/SRC/zptsv.f b/lapack-netlib/SRC/zptsv.f index d53e17a34..191adaff6 100644 --- a/lapack-netlib/SRC/zptsv.f +++ b/lapack-netlib/SRC/zptsv.f @@ -94,8 +94,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/zptsvx.f b/lapack-netlib/SRC/zptsvx.f index 4a40768f1..94409a7a1 100644 --- a/lapack-netlib/SRC/zptsvx.f +++ b/lapack-netlib/SRC/zptsvx.f @@ -60,7 +60,7 @@ *> factorization can also be regarded as having the form *> A = U**H*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -205,10 +205,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpttrf.f b/lapack-netlib/SRC/zpttrf.f index a106ec419..75ef847c4 100644 --- a/lapack-netlib/SRC/zpttrf.f +++ b/lapack-netlib/SRC/zpttrf.f @@ -71,8 +71,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. 
*> \endverbatim diff --git a/lapack-netlib/SRC/zrscl.c b/lapack-netlib/SRC/zrscl.c new file mode 100644 index 000000000..2264b5465 --- /dev/null +++ b/lapack-netlib/SRC/zrscl.c @@ -0,0 +1,735 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <complex.h> +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars;
int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ + zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0]; + zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1]; + } + } else { + for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ + zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0]; + zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1]; + } + } + pCf(z) = zdotc; +} +#else + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ + zdotc += conjf(Cf(&x[i])) * Cf(&y[i]); + } + } else { + for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ + zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]); + } + } + pCf(z) = zdotc; +} +#endif +/* zdotc_, cdotu_ and zdotu_, the remaining f2c dot-product helpers, are defined analogously; none of them is used by zrscl_. */ + +/* > \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download ZDRSCL + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZRSCL( N, A, X, INCX ) */ + +/* INTEGER INCX, N */ +/* COMPLEX*16 A */ +/* COMPLEX*16 X( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZRSCL multiplies an n-element complex vector x by the complex scalar */ +/* > 1/a. This is done without overflow or underflow as long as */ +/* > the final result x/a does not overflow or underflow. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of components of the vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 */ +/* > The scalar a which is used to divide each component of x. */ +/* > A must not be 0, or the subroutine will divide by zero. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension */ +/* > (1+(N-1)*abs(INCX)) */ +/* > The n-element vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] INCX */ +/* > \verbatim */ +/* > INCX is INTEGER */ +/* > The increment between successive values of the vector SX. */ +/* > > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd.
*/ + +/* > \ingroup complex16OTHERauxiliary */ + +/* ===================================================================== */ +/* Subroutine */ int zrscl_(integer *n, doublecomplex *a, doublecomplex *x, + integer *incx) +{ + /* System generated locals */ + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + doublereal absi, absr; + extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + doublecomplex *, integer *); + doublereal ai, ar; + extern doublereal dlamch_(char *); + doublereal ui, ov, ur, safmin, safmax; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zdrscl_(integer *, doublereal *, + doublecomplex *, integer *); + + +/* -- LAPACK auxiliary routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Quick return if possible */ + + /* Parameter adjustments */ + --x; + + /* Function Body */ + if (*n <= 0) { + return 0; + } + +/* Get machine parameters */ + + safmin = dlamch_("S"); + safmax = 1. / safmin; + ov = dlamch_("O"); + +/* Initialize constants related to A. */ + + ar = a->r; + ai = d_imag(a); + absr = abs(ar); + absi = abs(ai); + + if (ai == 0.) { +/* If alpha is real, then we can use csrscl */ + zdrscl_(n, &ar, &x[1], incx); + + } else if (ar == 0.) { +/* If alpha has a zero real part, then we follow the same rules as if */ +/* alpha were real. */ + if (absi > safmax) { + zdscal_(n, &safmin, &x[1], incx); + d__1 = -safmax / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + } else if (absi < safmin) { + d__1 = -safmin / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + zdscal_(n, &safmax, &x[1], incx); + } else { + d__1 = -1. / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + } + + } else { +/* The following numbers can be computed. */ +/* They are the inverse of the real and imaginary parts of 1/alpha. */ +/* Note that a and b are always different from zero. */ +/* NaNs are only possible if either: */ +/* 1. alphaR or alphaI is NaN. */ +/* 2. alphaR and alphaI are both infinite, in which case it makes sense */ +/* to propagate a NaN. */ + ur = ar + ai * (ai / ar); + ui = ai + ar * (ar / ai); + + if (abs(ur) < safmin || abs(ui) < safmin) { +/* This means that both alphaR and alphaI are very small. */ + d__1 = safmin / ur; + d__2 = -safmin / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + zdscal_(n, &safmax, &x[1], incx); + } else if (abs(ur) > safmax || abs(ui) > safmax) { + if (absr > ov || absi > ov) { +/* This means that a and b are both Inf. No need for scaling. */ + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } else { + zdscal_(n, &safmin, &x[1], incx); + if (abs(ur) > ov || abs(ui) > ov) { +/* Infs were generated. We do proper scaling to avoid them. */ + if (absr >= absi) { +/* ABS( UR ) <= ABS( UI ) */ + ur = safmin * ar + safmin * (ai * (ai / ar)); + ui = safmin * ai + ar * (safmin * ar / ai); + } else { +/* ABS( UR ) > ABS( UI ) */ + ur = safmin * ar + ai * (safmin * ai / ar); + ui = safmin * ai + safmin * (ar * (ar / ai)); + } + d__1 = 1. / ur; + d__2 = -1. 
/ ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } else { + d__1 = safmax / ur; + d__2 = -safmax / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } + } + } else { + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } + } + + return 0; + +/* End of ZRSCL */ + +} /* zrscl_ */ + diff --git a/lapack-netlib/SRC/zrscl.f b/lapack-netlib/SRC/zrscl.f new file mode 100644 index 000000000..970f6de75 --- /dev/null +++ b/lapack-netlib/SRC/zrscl.f @@ -0,0 +1,203 @@ +*> \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZDRSCL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZRSCL( N, A, X, INCX ) +* +* .. Scalar Arguments .. +* INTEGER INCX, N +* COMPLEX*16 A +* .. +* .. Array Arguments .. +* COMPLEX*16 X( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZRSCL multiplies an n-element complex vector x by the complex scalar +*> 1/a. This is done without overflow or underflow as long as +*> the final result x/a does not overflow or underflow. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of components of the vector x. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 +*> The scalar a which is used to divide each component of x. +*> A must not be 0, or the subroutine will divide by zero. +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension +*> (1+(N-1)*abs(INCX)) +*> The n-element vector x. +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> The increment between successive values of the vector SX. +*> > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERauxiliary +* +* ===================================================================== + SUBROUTINE ZRSCL( N, A, X, INCX ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INCX, N + COMPLEX*16 A +* .. +* .. Array Arguments .. + COMPLEX*16 X( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR, UI +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + COMPLEX*16 ZLADIV + EXTERNAL DLAMCH, ZLADIV +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, ZDSCAL, ZDRSCL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.LE.0 ) + $ RETURN +* +* Get machine parameters +* + SAFMIN = DLAMCH( 'S' ) + SAFMAX = ONE / SAFMIN + OV = DLAMCH( 'O' ) +* +* Initialize constants related to A. 
+* + AR = DBLE( A ) + AI = DIMAG( A ) + ABSR = ABS( AR ) + ABSI = ABS( AI ) +* + IF( AI.EQ.ZERO ) THEN +* If alpha is real, then we can use csrscl + CALL ZDRSCL( N, AR, X, INCX ) +* + ELSE IF( AR.EQ.ZERO ) THEN +* If alpha has a zero real part, then we follow the same rules as if +* alpha were real. + IF( ABSI.GT.SAFMAX ) THEN + CALL ZDSCAL( N, SAFMIN, X, INCX ) + CALL ZSCAL( N, DCMPLX( ZERO, -SAFMAX / AI ), X, INCX ) + ELSE IF( ABSI.LT.SAFMIN ) THEN + CALL ZSCAL( N, DCMPLX( ZERO, -SAFMIN / AI ), X, INCX ) + CALL ZDSCAL( N, SAFMAX, X, INCX ) + ELSE + CALL ZSCAL( N, DCMPLX( ZERO, -ONE / AI ), X, INCX ) + END IF +* + ELSE +* The following numbers can be computed. +* They are the inverse of the real and imaginary parts of 1/alpha. +* Note that a and b are always different from zero. +* NaNs are only possible if either: +* 1. alphaR or alphaI is NaN. +* 2. alphaR and alphaI are both infinite, in which case it makes sense +* to propagate a NaN. + UR = AR + AI * ( AI / AR ) + UI = AI + AR * ( AR / AI ) +* + IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN +* This means that both alphaR and alphaI are very small. + CALL ZSCAL( N, DCMPLX( SAFMIN / UR, -SAFMIN / UI ), X, + $ INCX ) + CALL ZDSCAL( N, SAFMAX, X, INCX ) + ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN + IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN +* This means that a and b are both Inf. No need for scaling. + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL ZDSCAL( N, SAFMIN, X, INCX ) + IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN +* Infs were generated. We do proper scaling to avoid them. + IF( ABSR.GE.ABSI ) THEN +* ABS( UR ) <= ABS( UI ) + UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR )) + UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI ) + ELSE +* ABS( UR ) > ABS( UI ) + UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR ) + UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI )) + END IF + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, + $ INCX ) + ELSE + CALL ZSCAL( N, DCMPLX( SAFMAX / UR, -SAFMAX / UI ), + $ X, INCX ) + END IF + END IF + ELSE + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX ) + END IF + END IF +* + RETURN +* +* End of ZRSCL +* + END diff --git a/lapack-netlib/SRC/zstedc.f b/lapack-netlib/SRC/zstedc.f index 74d390af7..e62063a19 100644 --- a/lapack-netlib/SRC/zstedc.f +++ b/lapack-netlib/SRC/zstedc.f @@ -43,12 +43,6 @@ *> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zstegr.f b/lapack-netlib/SRC/zstegr.f index ee43a0d21..3736a0517 100644 --- a/lapack-netlib/SRC/zstegr.f +++ b/lapack-netlib/SRC/zstegr.f @@ -56,7 +56,7 @@ *> *> Note : ZSTEGR and ZSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. 
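The heart of the new ZRSCL above is the reciprocal identity it scales around: for a = ar + i*ai with both parts nonzero, 1/a = 1/ur - i/ui, where ur = ar + ai*(ai/ar) and ui = ai + ar*(ar/ai) are (up to sign) the inverses of the real and imaginary parts of 1/a, as the routine's comments put it. A minimal C double-precision sketch of just this identity, leaving out the SAFMIN/SAFMAX bracketing that ZRSCL adds so ur and ui can be formed even when a is huge or tiny:

```c
#include <stdio.h>
#include <complex.h>

int main(void)
{
    double ar = 3.0, ai = -4.0;           /* a = 3 - 4i, |a|^2 = 25 */
    double ur = ar + ai * (ai / ar);      /* 3 + 16/3 = 25/3        */
    double ui = ai + ar * (ar / ai);      /* -4 - 9/4 = -25/4       */

    /* ZRSCL scales x by exactly this value: DCMPLX(1/ur, -1/ui). */
    double complex recip = 1.0 / ur - I * (1.0 / ui);
    double complex ref   = 1.0 / (ar + ai * I);  /* plain division */

    printf("recip = %g%+gi, ref = %g%+gi\n",
           creal(recip), cimag(recip), creal(ref), cimag(ref));
    return 0;                             /* both print 0.12+0.16i */
}
```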
*> \endverbatim diff --git a/lapack-netlib/SRC/zstemr.f b/lapack-netlib/SRC/zstemr.f index b034198de..4eaf5ef97 100644 --- a/lapack-netlib/SRC/zstemr.f +++ b/lapack-netlib/SRC/zstemr.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -330,6 +330,7 @@ *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n *> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE ZSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -361,7 +362,8 @@ $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -397,6 +399,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * DSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -519,6 +522,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -526,8 +538,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -550,8 +567,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/zsyconvf.f b/lapack-netlib/SRC/zsyconvf.f index eb49b0f3d..0958a5f77 100644 --- a/lapack-netlib/SRC/zsyconvf.f +++ b/lapack-netlib/SRC/zsyconvf.f @@ -39,7 +39,7 @@ *> ZSYTRF provided on entry in parameter A into the factorization *> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in ZSYTRF into +*> the interchanges stored in IPIV from the format used in ZSYTRF into *> the format used in ZSYTRF_RK (or ZSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or ZSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in ZSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in ZSYTRF_RK +*> the interchanges stored in IPIV from the format used in ZSYTRF_RK *> (or ZSYTRF_BK) into the format used in ZSYTRF. *> *> ZSYCONVF can also convert in Hermitian matrix case, i.e. 
between @@ -325,7 +325,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -469,7 +469,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -535,7 +535,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/zsyconvf_rook.f b/lapack-netlib/SRC/zsyconvf_rook.f index 3cfa694c3..62cca060b 100644 --- a/lapack-netlib/SRC/zsyconvf_rook.f +++ b/lapack-netlib/SRC/zsyconvf_rook.f @@ -520,7 +520,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f index 4f19630a9..701d73a38 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.f +++ b/lapack-netlib/SRC/zsysv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f index a775a8758..dc9016c69 100644 --- a/lapack-netlib/SRC/zsytrf.f +++ b/lapack-netlib/SRC/zsytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zsytrf_aa.f b/lapack-netlib/SRC/zsytrf_aa.f index 73d257cfe..ddb19925a 100644 --- a/lapack-netlib/SRC/zsytrf_aa.f +++ b/lapack-netlib/SRC/zsytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f index b731cb7a2..95b9fda0d 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf_rk.f b/lapack-netlib/SRC/zsytrf_rk.f index 3b398ce6c..af8b8d501 100644 --- a/lapack-netlib/SRC/zsytrf_rk.f +++ b/lapack-netlib/SRC/zsytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ztgevc.f b/lapack-netlib/SRC/ztgevc.f index 23bd36ddb..6dcbe582f 100644 --- a/lapack-netlib/SRC/ztgevc.f +++ b/lapack-netlib/SRC/ztgevc.f @@ -53,7 +53,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal elements of S and P. *> @@ -154,7 +154,7 @@ *> \verbatim *> VR is COMPLEX*16 array, dimension (LDVR,MM) *> On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must -*> contain an N-by-N matrix Q (usually the unitary matrix Z +*> contain an N-by-N matrix Z (usually the unitary matrix Z *> of right Schur vectors returned by ZHGEQZ). *> On exit, if SIDE = 'R' or 'B', VR contains: *> if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P); @@ -259,7 +259,7 @@ EXTERNAL LSAME, DLAMCH, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEMV + EXTERNAL XERBLA, ZGEMV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -367,7 +367,6 @@ * SAFMIN = DLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL DLABAD( SAFMIN, BIG ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/ztgsy2.f b/lapack-netlib/SRC/ztgsy2.f index ee26b5e7b..0cae8939e 100644 --- a/lapack-netlib/SRC/ztgsy2.f +++ b/lapack-netlib/SRC/ztgsy2.f @@ -57,7 +57,7 @@ *> Z = [ kron(In, A) -kron(B**H, Im) ] (2) *> [ kron(In, D) -kron(E**H, Im) ], *> -*> Ik is the identity matrix of size k and X**H is the conjuguate transpose of X. +*> Ik is the identity matrix of size k and X**H is the conjugate transpose of X. *> kron(X, Y) is the Kronecker product between the matrices X and Y. *> *> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b diff --git a/lapack-netlib/SRC/ztrevc3.f b/lapack-netlib/SRC/ztrevc3.f index 6300e80ae..8fb144e0c 100644 --- a/lapack-netlib/SRC/ztrevc3.f +++ b/lapack-netlib/SRC/ztrevc3.f @@ -321,9 +321,9 @@ * INFO = 0 NB = ILAENV( 1, 'ZTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK - RWORK(1) = N + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/ztrexc.f b/lapack-netlib/SRC/ztrexc.f index b92e63efa..32aae51e4 100644 --- a/lapack-netlib/SRC/ztrexc.f +++ b/lapack-netlib/SRC/ztrexc.f @@ -40,7 +40,7 @@ *> *> The Schur form T is reordered by a unitary similarity transformation *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by -*> postmultplying it with Z. +*> postmultiplying it with Z. 
*> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zunbdb1.f b/lapack-netlib/SRC/zunbdb1.f index 767788a74..2fae170de 100644 --- a/lapack-netlib/SRC/zunbdb1.f +++ b/lapack-netlib/SRC/zunbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f index 46b08aa1e..28e78fc23 100644 --- a/lapack-netlib/SRC/zunbdb2.f +++ b/lapack-netlib/SRC/zunbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb3.f b/lapack-netlib/SRC/zunbdb3.f index 74083e41a..9f32a7a88 100644 --- a/lapack-netlib/SRC/zunbdb3.f +++ b/lapack-netlib/SRC/zunbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f index 4672cfa67..a1db5eb79 100644 --- a/lapack-netlib/SRC/zunbdb4.f +++ b/lapack-netlib/SRC/zunbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> ZUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zuncsd2by1.f b/lapack-netlib/SRC/zuncsd2by1.f index 2d861c2cc..399b598be 100644 --- a/lapack-netlib/SRC/zuncsd2by1.f +++ b/lapack-netlib/SRC/zuncsd2by1.f @@ -211,13 +211,13 @@ *> LRWORK is INTEGER *> The dimension of the array RWORK. *> -*> If LRWORK = -1, then a workspace query is assumed; the routine +*> If LRWORK=-1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK and RWORK *> arrays, returns this value as the first entry of the WORK *> and RWORK array, respectively, and no error message related *> to LWORK or LRWORK is issued by XERBLA. *> \endverbatim -* +*> *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q)) diff --git a/lapack-netlib/SRC/zungtsqr.f b/lapack-netlib/SRC/zungtsqr.f index 4f2d7dfdd..23e28ac5c 100644 --- a/lapack-netlib/SRC/zungtsqr.f +++ b/lapack-netlib/SRC/zungtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -302,4 +303,4 @@ * * End of ZUNGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/TESTING/EIG/cchkst.f b/lapack-netlib/TESTING/EIG/cchkst.f index 95747d051..ed535e91f 100644 --- a/lapack-netlib/TESTING/EIG/cchkst.f +++ b/lapack-netlib/TESTING/EIG/cchkst.f @@ -364,7 +364,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. 
*> \endverbatim @@ -665,8 +665,7 @@ EXTERNAL CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY, $ CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR, $ CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, CUPGTR, - $ SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF, - $ XERBLA + $ SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, INT, LOG, MAX, MIN, REAL, SQRT @@ -733,7 +732,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f index e4deb8ac8..668b51a36 100644 --- a/lapack-netlib/TESTING/EIG/cchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f @@ -385,7 +385,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -683,10 +683,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF, - $ XERBLA, CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, - $ CLACPY, CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, - $ CSTEMR, CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, + EXTERNAL SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA, + $ CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY, + $ CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR, + $ CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, $ CUPGTR, CHETRD_2STAGE, SLASET * .. * .. Intrinsic Functions .. @@ -754,7 +754,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/cdrgsx.f b/lapack-netlib/TESTING/EIG/cdrgsx.f index c5baeef5e..1729770a2 100644 --- a/lapack-netlib/TESTING/EIG/cdrgsx.f +++ b/lapack-netlib/TESTING/EIG/cdrgsx.f @@ -395,7 +395,7 @@ * .. * .. External Subroutines .. EXTERNAL ALASVM, CGESVD, CGET51, CGGESX, CLACPY, CLAKF2, - $ CLASET, CLATM5, SLABAD, XERBLA + $ CLASET, CLATM5, XERBLA * .. * .. Scalars in Common .. LOGICAL FS @@ -478,7 +478,6 @@ ULPINV = ONE / ULP SMLNUM = SLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -917,7 +916,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/cdrgvx.f b/lapack-netlib/TESTING/EIG/cdrgvx.f index 830a39d86..4114b697f 100644 --- a/lapack-netlib/TESTING/EIG/cdrgvx.f +++ b/lapack-netlib/TESTING/EIG/cdrgvx.f @@ -56,7 +56,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. 
*> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f index d15b39d01..729976738 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg.f @@ -236,7 +236,7 @@ *> *> B COMPLEX array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -420,7 +420,7 @@ * .. External Subroutines .. EXTERNAL CHBGV, CHBGVD, CHBGVX, CHEGV, CHEGVD, CHEGVX, $ CHPGV, CHPGVD, CHPGVX, CLACPY, CLASET, CLATMR, - $ CLATMS, CSGT01, SLABAD, SLAFTS, SLASUM, XERBLA + $ CLATMS, CSGT01, SLAFTS, SLASUM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, REAL, SQRT @@ -481,7 +481,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f index 8b8553773..f7d323247 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f @@ -242,7 +242,7 @@ *> *> B COMPLEX array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -426,7 +426,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD, + EXTERNAL SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD, $ CHBGVX, CHEGV, CHEGVD, CHEGVX, CHPGV, CHPGVD, $ CHPGVX, CLACPY, CLASET, CLATMR, CLATMS, CSGT01, $ CHEGV_2STAGE @@ -490,7 +490,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvst.f b/lapack-netlib/TESTING/EIG/cdrvst.f index 9c129c0e8..205f06f0d 100644 --- a/lapack-netlib/TESTING/EIG/cdrvst.f +++ b/lapack-netlib/TESTING/EIG/cdrvst.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -393,8 +393,8 @@ * .. External Subroutines .. EXTERNAL ALASVM, CHBEV, CHBEVD, CHBEVX, CHEEV, CHEEVD, $ CHEEVR, CHEEVX, CHET21, CHET22, CHPEV, CHPEVD, - $ CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLABAD, - $ SLAFTS, XERBLA + $ CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLAFTS, + $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, REAL, SQRT @@ -451,7 +451,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvst2stg.f b/lapack-netlib/TESTING/EIG/cdrvst2stg.f index 954c7fb87..258f1f370 100644 --- a/lapack-netlib/TESTING/EIG/cdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/cdrvst2stg.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. 
On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,7 +391,7 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLAFTS, XERBLA, CHBEV, CHBEVD, + EXTERNAL ALASVM, SLAFTS, XERBLA, CHBEV, CHBEVD, $ CHBEVX, CHEEV, CHEEVD, CHEEVR, CHEEVX, CHET21, $ CHET22, CHPEV, CHPEVD, CHPEVX, CLACPY, CLASET, $ CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE, @@ -453,7 +453,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cerrhs.f b/lapack-netlib/TESTING/EIG/cerrhs.f index 0568a6d78..2dd86b8c9 100644 --- a/lapack-netlib/TESTING/EIG/cerrhs.f +++ b/lapack-netlib/TESTING/EIG/cerrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CUNGHR, -*> CUNMHR, CHSEQR, CHSEIN, and CTREVC. +*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CGEHD2, +*> CUNGHR, CUNMHR, CHSEQR, CHSEIN, CTREVC, and CTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, CGEBAK, CGEBAL, CGEHRD, CHSEIN, CHSEQR, - $ CUNGHR, CUNMHR, CTREVC + $ CUNGHR, CUNMHR, CTREVC, CTREVC3, CGEHD2 * .. * .. Intrinsic Functions .. INTRINSIC REAL @@ -193,6 +193,29 @@ CALL CHKXER( 'CGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* CGEHD2 +* + SRNAMT = 'CGEHD2' + INFOT = 1 + CALL CGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * CUNGHR * SRNAMT = 'CUNGHR' @@ -398,6 +421,47 @@ $ RW, INFO ) CALL CHKXER( 'CTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* CTREVC3 +* + SRNAMT = 'CTREVC3' + INFOT = 1 + CALL CTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 16 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ LW, RW, 1, INFO ) + 
CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 9 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index eef34b44b..1748a2aad 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -21,7 +21,7 @@ *> *> \verbatim *> -*> CERRST tests the error exits for CHETRD, CUNGTR, CUNMTR, CHPTRD, +*> CERRST tests the error exits for CHETRD, CHETD2, CUNGTR, CUNMTR, CHPTRD, *> CUNGTR, CUPMTR, CSTEQR, CSTEIN, CPTEQR, CHBTRD, *> CHEEV, CHEEVX, CHEEVD, CHBEV, CHBEVX, CHBEVD, *> CHPEV, CHPEVX, CHPEVD, and CSTEDC. @@ -94,7 +94,7 @@ EXTERNAL CHBEV, CHBEVD, CHBEVX, CHBTRD, CHEEV, CHEEVD, $ CHEEVR, CHEEVX, CHETRD, CHKXER, CHPEV, CHPEVD, $ CHPEVX, CHPTRD, CPTEQR, CSTEDC, CSTEIN, CSTEQR, - $ CUNGTR, CUNMTR, CUPGTR, CUPMTR, + $ CUNGTR, CUNMTR, CUPGTR, CUPMTR, CHETD2, $ CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE, $ CHEEV_2STAGE, CHBEV_2STAGE, CHBEVD_2STAGE, $ CHBEVX_2STAGE, CHETRD_2STAGE, CHETRD_HE2HB, @@ -156,6 +156,20 @@ CALL CHKXER( 'CHETRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* CHETD2 +* + SRNAMT = 'CHETD2' + INFOT = 1 + CALL CHETD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * CHETRD_2STAGE * SRNAMT = 'CHETRD_2STAGE' @@ -628,56 +642,56 @@ SRNAMT = 'CHEEVX_2STAGE' INFOT = 1 CALL CHEEVX_2STAGE( '/', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CHEEVX_2STAGE( 'V', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEEVX_2STAGE( 'N', '/', 'U', 0, A, 1, - $ 0.0D0, 1.0D0, 1, 0, 0.0D0, + $ 0.0, 1.0, 1, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEEVX_2STAGE( 'N', 'A', '/', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) INFOT = 4 CALL CHEEVX_2STAGE( 'N', 'A', 'U', -1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHEEVX_2STAGE( 'N', 'V', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEEVX_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHEEVX_2STAGE( 'N', 'I', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 2, 1, 0.0D0, + $ 0.0, 0.0, 2, 1, 0.0, $ M, X, Z, 2, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 15 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 0, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 17 CALL 
CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 0, RW, IW, I1, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 11 @@ -755,79 +769,79 @@ N = 1 INFOT = 1 CALL CHEEVR_2STAGE( '/', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CHEEVR_2STAGE( 'V', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEEVR_2STAGE( 'N', '/', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEEVR_2STAGE( 'N', 'A', '/', -1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, $ IW( 2*N+1 ), 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEEVR_2STAGE( 'N', 'A', 'U', -1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, $ IW( 2*N+1 ), 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHEEVR_2STAGE( 'N', 'A', 'U', 2, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHEEVR_2STAGE( 'N', 'V', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 1, 0.0D0, + $ 0.0, 0.0, 0, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 2, 1, 0.0D0, + $ 0.0, 0.0, 2, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 15 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 0, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) @@ -1259,65 +1273,65 @@ SRNAMT = 'CHBEVX_2STAGE' INFOT = 1 CALL CHBEVX_2STAGE( '/', 'A', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) INFOT = 1 CALL CHBEVX_2STAGE( 'V', 'A', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, 
I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHBEVX_2STAGE( 'N', '/', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 1.0D0, 1, 0, 0.0D0, + $ 0.0, 1.0, 1, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHBEVX_2STAGE( 'N', 'A', '/', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) INFOT = 4 CALL CHBEVX_2STAGE( 'N', 'A', 'U', -1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 0, -1, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 1, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) * INFOT = 9 * CALL CHBEVX_2STAGE( 'V', 'A', 'U', 2, 0, A, 1, Q, 1, -* $ 0.0D0, 0.0D0, 0, 0, 0.0D0, +* $ 0.0, 0.0, 0, 0, 0.0, * $ M, X, Z, 2, W, 0, RW, IW, I3, INFO ) * CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CHBEVX_2STAGE( 'N', 'V', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 1, 2, 0.0D0, + $ 0.0, 0.0, 1, 2, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 0, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 12 diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f index 82d790daa..8a3cd1ae5 100644 --- a/lapack-netlib/TESTING/EIG/csyl01.f +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -120,14 +120,16 @@ COMPLEX RMUL * .. * .. Local Arrays .. - COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + COMPLEX DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ) - REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + REAL DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X + REAL, DIMENSION(:,:), ALLOCATABLE :: SWORK +* .. * .. External Functions .. LOGICAL SISNAN REAL SLAMCH, CLANGE @@ -139,6 +141,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX * .. +* .. Allocate memory dynamically .. 
+ ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -286,6 +302,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/dchkst.f b/lapack-netlib/TESTING/EIG/dchkst.f index 2e04f68c5..6e02c84fb 100644 --- a/lapack-netlib/TESTING/EIG/dchkst.f +++ b/lapack-netlib/TESTING/EIG/dchkst.f @@ -363,7 +363,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -645,10 +645,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR, - $ DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, - $ DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, - $ DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA + EXTERNAL DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS, + $ DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ, + $ DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF, + $ DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, LOG, MAX, MIN, SQRT @@ -715,7 +715,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f index 2c98b802d..0fbebc8b7 100644 --- a/lapack-netlib/TESTING/EIG/dchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f @@ -384,7 +384,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -666,10 +666,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR, - $ DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, - $ DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, - $ DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA, + EXTERNAL DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS, + $ DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ, + $ DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF, + $ DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA, $ DSYTRD_2STAGE * .. * .. Intrinsic Functions .. 
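Worth spelling out for the cerrst.f hunks a few screens up: the wholesale 0.0D0 -> 0.0 and 1.0D0 -> 1.0 edits in the CHEEVX_2STAGE, CHEEVR_2STAGE and CHBEVX_2STAGE calls are type corrections, not cosmetics. Those routines are single precision, so their VL, VU and ABSTOL dummies are REAL, while a D0 literal is a DOUBLE PRECISION actual argument, and Fortran does not convert a literal to the dummy's kind across an implicit interface. A hypothetical minimal illustration (DEMO is invented for this note and stands in for any REAL dummy argument):

      PROGRAM KINDDEMO
*     Invented example of the literal-kind mismatch fixed in cerrst.f.
      EXTERNAL DEMO
      CALL DEMO( 0.0 )
*     CALL DEMO( 0.0D0 ) would pass an 8-byte DOUBLE PRECISION datum
*     where DEMO reads a 4-byte REAL -- an interface mismatch.
      END
      SUBROUTINE DEMO( VL )
      REAL VL
      WRITE( *, * ) 'VL = ', VL
      END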
@@ -737,7 +737,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/ddrgsx.f b/lapack-netlib/TESTING/EIG/ddrgsx.f index b3f5e23f4..82470ead3 100644 --- a/lapack-netlib/TESTING/EIG/ddrgsx.f +++ b/lapack-netlib/TESTING/EIG/ddrgsx.f @@ -400,7 +400,7 @@ EXTERNAL DLCTSX, ILAENV, DLAMCH, DLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, DGESVD, DGET51, DGET53, DGGESX, DLABAD, + EXTERNAL ALASVM, DGESVD, DGET51, DGET53, DGGESX, $ DLACPY, DLAKF2, DLASET, DLATM5, XERBLA * .. * .. Intrinsic Functions .. @@ -478,7 +478,6 @@ ULPINV = ONE / ULP SMLNUM = DLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -984,7 +983,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/ddrgvx.f b/lapack-netlib/TESTING/EIG/ddrgvx.f index c63762134..3de72eb70 100644 --- a/lapack-netlib/TESTING/EIG/ddrgvx.f +++ b/lapack-netlib/TESTING/EIG/ddrgvx.f @@ -56,7 +56,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f index 2e9d3c643..72c373086 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg.f @@ -234,7 +234,7 @@ *> *> B DOUBLE PRECISION array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -399,7 +399,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, + EXTERNAL DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, $ DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV, $ DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA * .. @@ -460,7 +460,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f index 196c6b48e..0fe31cab1 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f @@ -240,7 +240,7 @@ *> *> B DOUBLE PRECISION array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -408,7 +408,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. 
- EXTERNAL DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, + EXTERNAL DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, $ DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV, $ DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA, $ DSYGV_2STAGE @@ -470,7 +470,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvst.f b/lapack-netlib/TESTING/EIG/ddrvst.f index a25077018..805fd8271 100644 --- a/lapack-netlib/TESTING/EIG/ddrvst.f +++ b/lapack-netlib/TESTING/EIG/ddrvst.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,11 +502,11 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR, - $ DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, - $ DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, - $ DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, - $ DSYT22, XERBLA + EXTERNAL ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, DLATMS, + $ DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, DSPEVX, + $ DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, DSTT22, + $ DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, DSYT22, + $ XERBLA * .. * .. Scalars in Common .. CHARACTER*32 SRNAMT @@ -574,7 +574,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvst2stg.f b/lapack-netlib/TESTING/EIG/ddrvst2stg.f index c9a2632bb..e38671e38 100644 --- a/lapack-netlib/TESTING/EIG/ddrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/ddrvst2stg.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,7 +502,7 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR, + EXTERNAL ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, $ DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, $ DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, $ DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, @@ -577,7 +577,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/derrhs.f b/lapack-netlib/TESTING/EIG/derrhs.f index fec41c0f0..583bebc9b 100644 --- a/lapack-netlib/TESTING/EIG/derrhs.f +++ b/lapack-netlib/TESTING/EIG/derrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DERRHS tests the error exits for DGEBAK, SGEBAL, SGEHRD, DORGHR, -*> DORMHR, DHSEQR, SHSEIN, and DTREVC. +*> DERRHS tests the error exits for DGEBAK, DGEBAL, DGEHRD, DGEHD2, +*> DORGHR, DORMHR, DHSEQR, DHSEIN, DTREVC, and DTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, DGEBAK, DGEBAL, DGEHRD, DHSEIN, DHSEQR, - $ DORGHR, DORMHR, DTREVC + $ DORGHR, DORMHR, DTREVC, DTREVC3, DGEHD2 * .. * .. Intrinsic Functions .. 
INTRINSIC DBLE @@ -194,6 +194,29 @@ CALL CHKXER( 'DGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* DGEHD2 +* + SRNAMT = 'DGEHD2' + INFOT = 1 + CALL DGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * DORGHR * SRNAMT = 'DORGHR' @@ -328,7 +351,11 @@ CALL DHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, $ INFO ) CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK ) - NT = NT + 9 + INFOT = 13 + CALL DHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, + $ INFO ) + CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK ) + NT = NT + 10 * * DHSEIN * @@ -399,6 +426,43 @@ $ INFO ) CALL CHKXER( 'DTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* DTREVC3 +* + SRNAMT = 'DTREVC3' + INFOT = 1 + CALL DTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index f297e5a7d..059538644 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -21,10 +21,10 @@ *> *> \verbatim *> -*> DERRST tests the error exits for DSYTRD, DORGTR, DORMTR, DSPTRD, -*> DOPGTR, DOPMTR, DSTEQR, SSTERF, SSTEBZ, SSTEIN, DPTEQR, DSBTRD, -*> DSYEV, SSYEVX, SSYEVD, DSBEV, SSBEVX, SSBEVD, -*> DSPEV, SSPEVX, SSPEVD, DSTEV, SSTEVX, SSTEVD, and SSTEDC. +*> DERRST tests the error exits for DSYTRD, DSYTD2, DORGTR, DORMTR, DSPTRD, +*> DOPGTR, DOPMTR, DSTEQR, DSTERF, DSTEBZ, DSTEIN, DPTEQR, DSBTRD, +*> DSYEV, DSYEVX, DSYEVD, DSBEV, DSBEVX, DSBEVD, +*> DSPEV, DSPEVX, DSPEVD, DSTEV, DSTEVX, DSTEVD, and DSTEDC. 
*> DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE, *> DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE, *> DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB, @@ -95,7 +95,7 @@ $ DSBEV, DSBEVD, DSBEVX, DSBTRD, DSPEV, DSPEVD, $ DSPEVX, DSPTRD, DSTEBZ, DSTEDC, DSTEIN, DSTEQR, $ DSTERF, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSYEV, - $ DSYEVD, DSYEVR, DSYEVX, DSYTRD, + $ DSYEVD, DSYEVR, DSYEVX, DSYTRD, DSYTD2, $ DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE, $ DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE, $ DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB, @@ -157,6 +157,20 @@ CALL CHKXER( 'DSYTRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* DSYTD2 +* + SRNAMT = 'DSYTD2' + INFOT = 1 + CALL DSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * DSYTRD_2STAGE * SRNAMT = 'DSYTRD_2STAGE' diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f index 782d2cd42..0ea481382 100644 --- a/lapack-netlib/TESTING/EIG/dsyl01.f +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -117,13 +117,15 @@ $ SCALE, SCALE3, SMLNUM, TNRM, XNRM * .. * .. Local Arrays .. - DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + DOUBLE PRECISION DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), - $ SWORK( LDSWORK, 126 ), VM( 2 ) - INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) + $ VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X, + $ SWORK * .. * .. External Functions .. LOGICAL DISNAN @@ -136,6 +138,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 126 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -280,6 +296,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/schkst.f b/lapack-netlib/TESTING/EIG/schkst.f index 10622d77a..aecbdfe93 100644 --- a/lapack-netlib/TESTING/EIG/schkst.f +++ b/lapack-netlib/TESTING/EIG/schkst.f @@ -363,7 +363,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. 
*> \endverbatim @@ -645,10 +645,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR, - $ SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, - $ SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, - $ SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA + EXTERNAL SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS, + $ SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ, + $ SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF, + $ SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, REAL, SQRT @@ -715,7 +715,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f index ac5a3fc39..49899a660 100644 --- a/lapack-netlib/TESTING/EIG/schkst2stg.f +++ b/lapack-netlib/TESTING/EIG/schkst2stg.f @@ -384,7 +384,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -666,10 +666,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR, - $ SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, - $ SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, - $ SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA, + EXTERNAL SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS, + $ SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ, + $ SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF, + $ SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA, $ SSYTRD_2STAGE * .. * .. Intrinsic Functions .. @@ -737,7 +737,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/sdrgsx.f b/lapack-netlib/TESTING/EIG/sdrgsx.f index ce0d59214..2015cac1c 100644 --- a/lapack-netlib/TESTING/EIG/sdrgsx.f +++ b/lapack-netlib/TESTING/EIG/sdrgsx.f @@ -400,7 +400,7 @@ EXTERNAL SLCTSX, ILAENV, SLAMCH, SLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, SGESVD, SGET51, SGET53, SGGESX, SLABAD, + EXTERNAL ALASVM, SGESVD, SGET51, SGET53, SGGESX, $ SLACPY, SLAKF2, SLASET, SLATM5, XERBLA * .. * .. Intrinsic Functions .. 
@@ -479,7 +479,6 @@ c MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 ) ULPINV = ONE / ULP SMLNUM = SLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -985,7 +984,7 @@ c MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 ) $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/sdrgvx.f b/lapack-netlib/TESTING/EIG/sdrgvx.f index f6da720ad..91803f60a 100644 --- a/lapack-netlib/TESTING/EIG/sdrgvx.f +++ b/lapack-netlib/TESTING/EIG/sdrgvx.f @@ -57,7 +57,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f index 877579bcd..0c82b6f49 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg.f @@ -234,7 +234,7 @@ *> *> B REAL array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -399,7 +399,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, + EXTERNAL SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, $ SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV, $ SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA * .. @@ -460,7 +460,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f index ebd169977..38ed61628 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f @@ -240,7 +240,7 @@ *> *> B REAL array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -408,7 +408,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, + EXTERNAL SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, $ SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV, $ SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA, $ SSYGV_2STAGE @@ -470,7 +470,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvst.f b/lapack-netlib/TESTING/EIG/sdrvst.f index ea0cf66f9..be6d33cee 100644 --- a/lapack-netlib/TESTING/EIG/sdrvst.f +++ b/lapack-netlib/TESTING/EIG/sdrvst.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. 
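An easy-to-miss addition in derrhs.f above (mirrored for SHSEQR later in the patch): a tenth DHSEQR case with INFOT = 13. In DHSEQR( JOB, COMPZ, N, ILO, IHI, H, LDH, WR, WI, Z, LDZ, WORK, LWORK, INFO ), LWORK is argument 13; with JOB = 'E' and COMPZ = 'N' every earlier argument in the call is valid (LDZ = 1 is legal when Z is not referenced), so the first check to fire is LWORK = 1 < MAX( 1, N ) for N = 2, and XERBLA must report position 13. Quoted from the patch:

      INFOT = 13
      CALL DHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
     $             INFO )
      CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK )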
@@ -502,11 +502,11 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR, - $ SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, - $ SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, - $ SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, - $ SSYT22, XERBLA + EXTERNAL ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, SLATMS, + $ SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, SSPEVX, + $ SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, SSTT22, + $ SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, SSYT22, + $ XERBLA * .. * .. Scalars in Common .. CHARACTER*32 SRNAMT @@ -574,7 +574,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvst2stg.f b/lapack-netlib/TESTING/EIG/sdrvst2stg.f index a13a58b48..e05ec0749 100644 --- a/lapack-netlib/TESTING/EIG/sdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/sdrvst2stg.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,7 +502,7 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR, + EXTERNAL ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, $ SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, $ SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, $ SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, @@ -577,7 +577,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/serrhs.f b/lapack-netlib/TESTING/EIG/serrhs.f index 8f0ff98a1..89b7303cd 100644 --- a/lapack-netlib/TESTING/EIG/serrhs.f +++ b/lapack-netlib/TESTING/EIG/serrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SORGHR, -*> SORMHR, SHSEQR, SHSEIN, and STREVC. +*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SGEHD2, +*> SORGHR, SORMHR, SHSEQR, SHSEIN, STREVC, and STREVC3. *> \endverbatim * * Arguments: @@ -85,7 +85,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, SGEBAK, SGEBAL, SGEHRD, SHSEIN, SHSEQR, - $ SORGHR, SORMHR, STREVC + $ SORGHR, SORMHR, STREVC, STREVC3, SGEHD2 * .. * .. Intrinsic Functions .. 
INTRINSIC REAL @@ -193,6 +193,29 @@ CALL CHKXER( 'SGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* SGEHD2 +* + SRNAMT = 'SGEHD2' + INFOT = 1 + CALL SGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * SORGHR * SRNAMT = 'SORGHR' @@ -327,7 +350,11 @@ CALL SHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, $ INFO ) CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK ) - NT = NT + 9 + INFOT = 13 + CALL SHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, + $ INFO ) + CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK ) + NT = NT + 10 * * SHSEIN * @@ -398,6 +425,43 @@ $ INFO ) CALL CHKXER( 'STREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* STREVC3 +* + SRNAMT = 'STREVC3' + INFOT = 1 + CALL STREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL STREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index afb6d4faf..b87fc42ef 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -21,7 +21,7 @@ *> *> \verbatim *> -*> SERRST tests the error exits for SSYTRD, SORGTR, SORMTR, SSPTRD, +*> SERRST tests the error exits for SSYTRD, SSYTD2, SORGTR, SORMTR, SSPTRD, *> SOPGTR, SOPMTR, SSTEQR, SSTERF, SSTEBZ, SSTEIN, SPTEQR, SSBTRD, *> SSYEV, SSYEVX, SSYEVD, SSBEV, SSBEVX, SSBEVD, *> SSPEV, SSPEVX, SSPEVD, SSTEV, SSTEVX, SSTEVD, and SSTEDC. 
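All of the blocks this patch adds to the error-exit testers -- xGEHD2, xHETD2/xSYTD2, xTREVC3, and the extra xHSEQR case -- follow the files' standing idiom: set the global INFOT to the position of the one deliberately invalid argument, call the routine with every other argument valid, and let CHKXER confirm that XERBLA flagged exactly that position; NT is then bumped by the number of cases added. The SGEHD2 block above is a representative instance, quoted from the patch with the reasoning spelled out:

      SRNAMT = 'SGEHD2'
      INFOT = 1
*     N = -1 is the only invalid argument, so XERBLA must report 1
      CALL SGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
      CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )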
@@ -95,7 +95,7 @@ $ SSBEV, SSBEVD, SSBEVX, SSBTRD, SSPEV, SSPEVD, $ SSPEVX, SSPTRD, SSTEBZ, SSTEDC, SSTEIN, SSTEQR, $ SSTERF, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSYEV, - $ SSYEVD, SSYEVR, SSYEVX, SSYTRD, + $ SSYEVD, SSYEVR, SSYEVX, SSYTRD, SSYTD2, $ SSYEVD_2STAGE, SSYEVR_2STAGE, SSYEVX_2STAGE, $ SSYEV_2STAGE, SSBEV_2STAGE, SSBEVD_2STAGE, $ SSBEVX_2STAGE, SSYTRD_2STAGE, SSYTRD_SY2SB, @@ -157,6 +157,20 @@ CALL CHKXER( 'SSYTRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* SSYTD2 +* + SRNAMT = 'SSYTD2' + INFOT = 1 + CALL SSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * SSYTRD_2STAGE * SRNAMT = 'SSYTRD_2STAGE' diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f index 22d089dc8..fda30a3c0 100644 --- a/lapack-netlib/TESTING/EIG/ssyl01.f +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -117,13 +117,15 @@ $ SCALE, SCALE3, SMLNUM, TNRM, XNRM * .. * .. Local Arrays .. - REAL A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + REAL DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), - $ SWORK( LDSWORK, 54 ), VM( 2 ) - INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) + $ VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X, + $ SWORK * .. * .. External Functions .. LOGICAL SISNAN @@ -136,6 +138,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -280,6 +296,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/zchkst.f b/lapack-netlib/TESTING/EIG/zchkst.f index 60496dde1..b77d94438 100644 --- a/lapack-netlib/TESTING/EIG/zchkst.f +++ b/lapack-netlib/TESTING/EIG/zchkst.f @@ -364,7 +364,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -662,11 +662,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. 
- EXTERNAL DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF, - $ XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, - $ ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, - $ ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, - $ ZUPGTR + EXTERNAL DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA, + $ ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY, + $ ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR, + $ ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT @@ -733,7 +732,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f index b1ef80816..3b333ef1c 100644 --- a/lapack-netlib/TESTING/EIG/zchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f @@ -385,7 +385,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -683,11 +683,11 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF, - $ XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, - $ ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, - $ ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, - $ ZUPGTR, ZHETRD_2STAGE, DLASET + EXTERNAL DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA, + $ ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY, + $ ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR, + $ ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR, + $ ZHETRD_2STAGE, DLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT @@ -754,7 +754,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/zdrgsx.f b/lapack-netlib/TESTING/EIG/zdrgsx.f index 096ed3c22..a486873fa 100644 --- a/lapack-netlib/TESTING/EIG/zdrgsx.f +++ b/lapack-netlib/TESTING/EIG/zdrgsx.f @@ -394,7 +394,7 @@ EXTERNAL ZLCTSX, ILAENV, DLAMCH, ZLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, XERBLA, ZGESVD, ZGET51, ZGGESX, + EXTERNAL ALASVM, XERBLA, ZGESVD, ZGET51, ZGGESX, $ ZLACPY, ZLAKF2, ZLASET, ZLATM5 * .. * .. Scalars in Common .. @@ -479,7 +479,6 @@ ULPINV = ONE / ULP SMLNUM = DLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -918,7 +917,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/zdrgvx.f b/lapack-netlib/TESTING/EIG/zdrgvx.f index 72e0303fb..813151f13 100644 --- a/lapack-netlib/TESTING/EIG/zdrgvx.f +++ b/lapack-netlib/TESTING/EIG/zdrgvx.f @@ -55,7 +55,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. 
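The xTREVC3 blocks also make the real/complex signature difference visible in the INFOT values: STREVC3 and DTREVC3 stop at INFOT = 14, where the workspace length LWORK sits, while CTREVC3 earlier, and ZTREVC3 just below, continue to 16 because the complex versions append RWORK and LRWORK as arguments 15 and 16. Both families accept a workspace query; a hedged sketch for ZTREVC3, assuming the usual LWORK = -1 / LRWORK = -1 query convention and caller-declared SEL, T, VL, VR and friends:

*     Workspace query: on return WORK( 1 ) holds the optimal LWORK.
      CALL ZTREVC3( 'B', 'A', SEL, N, T, LDT, VL, LDVL, VR, LDVR,
     $              MM, M, WORK, -1, RWORK, -1, INFO )
      LWORK = INT( WORK( 1 ) )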
*> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f index 71f1d6371..1796805ea 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg.f @@ -236,7 +236,7 @@ *> *> B COMPLEX*16 array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -418,7 +418,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, + EXTERNAL DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, $ ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD, $ ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01 * .. @@ -481,7 +481,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f index 4bdf2849e..c5ef4ce70 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f @@ -242,7 +242,7 @@ *> *> B COMPLEX*16 array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -426,7 +426,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, + EXTERNAL DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, $ ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD, $ ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01, $ ZHEGV_2STAGE @@ -490,7 +490,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvst.f b/lapack-netlib/TESTING/EIG/zdrvst.f index 384e58de1..f838f0af2 100644 --- a/lapack-netlib/TESTING/EIG/zdrvst.f +++ b/lapack-netlib/TESTING/EIG/zdrvst.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,10 +391,10 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD, - $ ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, - $ ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, - $ ZLATMR, ZLATMS + EXTERNAL ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, ZHBEVX, + $ ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, ZHET22, + $ ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, ZLATMR, + $ ZLATMS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, LOG, MAX, MIN, SQRT @@ -451,7 +451,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvst2stg.f b/lapack-netlib/TESTING/EIG/zdrvst2stg.f index 4a88e5218..4b989b460 100644 --- a/lapack-netlib/TESTING/EIG/zdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/zdrvst2stg.f @@ -204,7 +204,7 @@ *> Not modified. 
*> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,7 +391,7 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD, + EXTERNAL ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, $ ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, $ ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, $ ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE, @@ -453,7 +453,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zerrhs.f b/lapack-netlib/TESTING/EIG/zerrhs.f index 582338947..3a70a556e 100644 --- a/lapack-netlib/TESTING/EIG/zerrhs.f +++ b/lapack-netlib/TESTING/EIG/zerrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZERRHS tests the error exits for ZGEBAK, CGEBAL, CGEHRD, ZUNGHR, -*> ZUNMHR, ZHSEQR, CHSEIN, and ZTREVC. +*> ZERRHS tests the error exits for ZGEBAK, ZGEBAL, ZGEHRD, ZGEHD2, +*> ZUNGHR, ZUNMHR, ZHSEQR, ZHSEIN, ZTREVC, and ZTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEBAK, ZGEBAL, ZGEHRD, ZHSEIN, ZHSEQR, - $ ZTREVC, ZUNGHR, ZUNMHR + $ ZUNGHR, ZUNMHR, ZTREVC, ZTREVC3 * .. * .. Intrinsic Functions .. INTRINSIC DBLE @@ -193,6 +193,29 @@ CALL CHKXER( 'ZGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* ZGEHD2 +* + SRNAMT = 'ZGEHD2' + INFOT = 1 + CALL ZGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * ZUNGHR * SRNAMT = 'ZUNGHR' @@ -389,6 +412,47 @@ $ INFO ) CALL CHKXER( 'ZTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* ZTREVC3 +* + SRNAMT = 'ZTREVC3' + INFOT = 1 + CALL ZTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, 
M, W, + $ 2, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 16 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 9 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index 5b0e6f820..d7b41c053 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> ZERRST tests the error exits for ZHETRD, ZUNGTR, CUNMTR, ZHPTRD, +*> ZERRST tests the error exits for ZHETRD, ZHETD2, ZUNGTR, CUNMTR, ZHPTRD, *> ZUNGTR, ZUPMTR, ZSTEQR, CSTEIN, ZPTEQR, ZHBTRD, *> ZHEEV, CHEEVX, CHEEVD, ZHBEV, CHBEVX, CHBEVD, *> ZHPEV, CHPEVX, CHPEVD, and ZSTEDC. @@ -95,7 +95,7 @@ EXTERNAL CHKXER, ZHBEV, ZHBEVD, ZHBEVX, ZHBTRD, ZHEEV, $ ZHEEVD, ZHEEVR, ZHEEVX, ZHETRD, ZHPEV, ZHPEVD, $ ZHPEVX, ZHPTRD, ZPTEQR, ZSTEDC, ZSTEIN, ZSTEQR, - $ ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, + $ ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, ZHETD2, $ ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE, $ ZHEEV_2STAGE, ZHBEV_2STAGE, ZHBEVD_2STAGE, $ ZHBEVX_2STAGE, ZHETRD_2STAGE @@ -156,6 +156,20 @@ CALL CHKXER( 'ZHETRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* ZHETD2 +* + SRNAMT = 'ZHETD2' + INFOT = 1 + CALL ZHETD2( '/', 0, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * ZHETRD_2STAGE * SRNAMT = 'ZHETRD_2STAGE' diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f index 329f39dc4..5d26d494c 100644 --- a/lapack-netlib/TESTING/EIG/zsyl01.f +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -120,14 +120,16 @@ COMPLEX*16 RMUL * .. * .. Local Arrays .. - COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + COMPLEX*16 DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ) - DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + DOUBLE PRECISION DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: SWORK +* .. * .. External Functions .. LOGICAL DISNAN DOUBLE PRECISION DLAMCH, ZLANGE @@ -139,6 +141,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, SQRT * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 103 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. 
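zsyl01.f completes the set: all four Sylvester-equation testers (ssyl01, dsyl01, csyl01, zsyl01) now keep the large matrices A, B, C, CC, X and the SWORK workspace on the heap instead of the stack. Declared as fixed-size locals, their combined footprint can exceed a default thread stack; ALLOCATE with a checked STAT fails loudly instead of crashing. The pattern reduced to one array, as a sketch (the MAXM value here is illustrative, each tester defines its own):

      PROGRAM ALLOCDEMO
*     Reduced sketch of the heap-allocation pattern used in xSYL01.
      INTEGER MAXM
      PARAMETER ( MAXM = 245 )
      INTEGER AllocateStatus
      REAL, DIMENSION(:,:), ALLOCATABLE :: A
      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
*     ... the tests then use A exactly as the former local array ...
      A( 1, 1 ) = 0.0
      DEALLOCATE ( A, STAT = AllocateStatus )
      END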
* * Get machine parameters @@ -286,6 +302,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index f0423a23b..dd75394b3 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -777,7 +777,7 @@ $ 'triangular-pentagonal matrices' ) 8004 FORMAT( / 1X, A3, ': TS factorization for ', $ 'tall-skinny or short-wide matrices' ) - 8005 FORMAT( / 1X, A3, ': Householder recostruction from TSQR', + 8005 FORMAT( / 1X, A3, ': Householder reconstruction from TSQR', $ ' factorization output ', /,' for tall-skinny matrices.' ) * * GE matrix types diff --git a/lapack-netlib/TESTING/LIN/cchktp.f b/lapack-netlib/TESTING/LIN/cchktp.f index 18242ff54..e14f1062e 100644 --- a/lapack-netlib/TESTING/LIN/cchktp.f +++ b/lapack-netlib/TESTING/LIN/cchktp.f @@ -87,7 +87,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index 4b09361d8..2953a2bd5 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ), SCALE3( 2 ) + REAL RESULT( NTESTS ), RWORK2( 2*NMAX ), + $ SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -542,10 +543,10 @@ SRNAMT = 'CLATRS3' CALL CCOPY( N, X, 1, B, 1 ) CALL CCOPY( N, X, 1, B( N+1 ), 1 ) - CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CSSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from CLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/cerrhe.f b/lapack-netlib/TESTING/LIN/cerrhe.f index d0c5cf6b5..1798aed57 100644 --- a/lapack-netlib/TESTING/LIN/cerrhe.f +++ b/lapack-netlib/TESTING/LIN/cerrhe.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHETRF @@ -576,7 +576,7 @@ CALL CHKXER( 'CHETRS_AA_STAGE', INFOT, NOUT, LERR, OK ) * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/cerrhex.f b/lapack-netlib/TESTING/LIN/cerrhex.f index b6c889798..83b3a92ad 100644 --- a/lapack-netlib/TESTING/LIN/cerrhex.f +++ b/lapack-netlib/TESTING/LIN/cerrhex.f @@ -137,7 +137,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. 
* * CHETRF @@ -523,7 +523,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrsy.f b/lapack-netlib/TESTING/LIN/cerrsy.f index a236d2d18..6d284be98 100644 --- a/lapack-netlib/TESTING/LIN/cerrsy.f +++ b/lapack-netlib/TESTING/LIN/cerrsy.f @@ -130,7 +130,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSYTRF @@ -469,7 +469,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrsyx.f b/lapack-netlib/TESTING/LIN/cerrsyx.f index 34972668e..c8d269c84 100644 --- a/lapack-netlib/TESTING/LIN/cerrsyx.f +++ b/lapack-netlib/TESTING/LIN/cerrsyx.f @@ -135,7 +135,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSYTRF @@ -521,7 +521,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index 9ba784f62..ab83357f8 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. 
REAL R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'CLATRS3' INFOT = 1 - CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. diff --git a/lapack-netlib/TESTING/LIN/cerrvx.f b/lapack-netlib/TESTING/LIN/cerrvx.f index 701abd161..548d09d0c 100644 --- a/lapack-netlib/TESTING/LIN/cerrvx.f +++ b/lapack-netlib/TESTING/LIN/cerrvx.f @@ -87,7 +87,7 @@ * .. * .. External Subroutines .. 
EXTERNAL CGBSV, CGBSVX, CGESV, CGESVX, CGTSV, CGTSVX, - $ CHESV, CHESV_RK ,CHESV_ROOK, CHESVX, CHKXER, + $ CHESV, CHESV_RK, CHESV_ROOK, CHESVX, CHKXER, $ CHPSV, CHPSVX, CPBSV, CPBSVX, CPOSV, CPOSVX, $ CPPSV, CPPSVX, CPTSV, CPTSVX, CSPSV, CSPSVX, $ CSYSV, CSYSV_AA, CSYSV_RK, CSYSV_ROOK, @@ -651,6 +651,9 @@ INFOT = 3 CALL CHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) @@ -710,9 +713,15 @@ INFOT = 3 CALL CHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN * @@ -733,16 +742,44 @@ CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* CSYSV_AASEN +* + SRNAMT = 'CSYSV_AA' + INFOT = 1 + CALL CSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -763,14 +800,18 @@ CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * @@ -834,6 +875,9 @@ INFOT = 3 CALL CSYSV( 'U', 0, 
-1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) @@ -898,6 +942,9 @@ INFOT = 3 CALL CSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) diff --git a/lapack-netlib/TESTING/LIN/cgtt01.f b/lapack-netlib/TESTING/LIN/cgtt01.f index e504515c5..64dc5f8fa 100644 --- a/lapack-netlib/TESTING/LIN/cgtt01.f +++ b/lapack-netlib/TESTING/LIN/cgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cgtt02.f b/lapack-netlib/TESTING/LIN/cgtt02.f index 702e66eed..8b951acd5 100644 --- a/lapack-netlib/TESTING/LIN/cgtt02.f +++ b/lapack-netlib/TESTING/LIN/cgtt02.f @@ -40,14 +40,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/chet01_3.f b/lapack-netlib/TESTING/LIN/chet01_3.f index 58104a357..7e07b5790 100644 --- a/lapack-netlib/TESTING/LIN/chet01_3.f +++ b/lapack-netlib/TESTING/LIN/chet01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/clqt02.f b/lapack-netlib/TESTING/LIN/clqt02.f index 24eb05399..20297f6cc 100644 --- a/lapack-netlib/TESTING/LIN/clqt02.f +++ b/lapack-netlib/TESTING/LIN/clqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CLQT02 tests CUNGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, CLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/cptt01.f b/lapack-netlib/TESTING/LIN/cptt01.f index e4520ec3d..3b117ad28 100644 --- a/lapack-netlib/TESTING/LIN/cptt01.f +++ b/lapack-netlib/TESTING/LIN/cptt01.f @@ -36,7 +36,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cptt02.f b/lapack-netlib/TESTING/LIN/cptt02.f index da4f0e854..ffaef89bd 100644 --- a/lapack-netlib/TESTING/LIN/cptt02.f +++ b/lapack-netlib/TESTING/LIN/cptt02.f @@ -46,7 +46,7 @@ *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. 
*> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cqlt02.f b/lapack-netlib/TESTING/LIN/cqlt02.f index fc4685aa5..53080cd17 100644 --- a/lapack-netlib/TESTING/LIN/cqlt02.f +++ b/lapack-netlib/TESTING/LIN/cqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CQLT02 tests CUNGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, CQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/cqrt02.f b/lapack-netlib/TESTING/LIN/cqrt02.f index 62f176cd8..db22496cf 100644 --- a/lapack-netlib/TESTING/LIN/cqrt02.f +++ b/lapack-netlib/TESTING/LIN/cqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CQRT02 tests CUNGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, CQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/crqt02.f b/lapack-netlib/TESTING/LIN/crqt02.f index 8625c2f91..81c805dc4 100644 --- a/lapack-netlib/TESTING/LIN/crqt02.f +++ b/lapack-netlib/TESTING/LIN/crqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CRQT02 tests CUNGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, CRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/crzt01.f b/lapack-netlib/TESTING/LIN/crzt01.f index 6a75a8b31..94fdcb4a9 100644 --- a/lapack-netlib/TESTING/LIN/crzt01.f +++ b/lapack-netlib/TESTING/LIN/crzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL CUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL CUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/csyt01_3.f b/lapack-netlib/TESTING/LIN/csyt01_3.f index f0b316cef..879749a10 100644 --- a/lapack-netlib/TESTING/LIN/csyt01_3.f +++ b/lapack-netlib/TESTING/LIN/csyt01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/dchktp.f b/lapack-netlib/TESTING/LIN/dchktp.f index 9af6150ca..6db5b1376 100644 --- a/lapack-netlib/TESTING/LIN/dchktp.f +++ b/lapack-netlib/TESTING/LIN/dchktp.f @@ -86,7 +86,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/ddrvab.f b/lapack-netlib/TESTING/LIN/ddrvab.f index 9110d8334..5fd32b951 100644 --- a/lapack-netlib/TESTING/LIN/ddrvab.f +++ b/lapack-netlib/TESTING/LIN/ddrvab.f @@ -346,7 +346,7 @@ CALL DGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. 
* diff --git a/lapack-netlib/TESTING/LIN/ddrvac.f b/lapack-netlib/TESTING/LIN/ddrvac.f index bd463cee4..3ecbc6a23 100644 --- a/lapack-netlib/TESTING/LIN/ddrvac.f +++ b/lapack-netlib/TESTING/LIN/ddrvac.f @@ -365,7 +365,7 @@ CALL DPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/derrsy.f b/lapack-netlib/TESTING/LIN/derrsy.f index af15a4b8e..eb08d4c7b 100644 --- a/lapack-netlib/TESTING/LIN/derrsy.f +++ b/lapack-netlib/TESTING/LIN/derrsy.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * DSYTRF @@ -581,7 +581,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * DSPTRF diff --git a/lapack-netlib/TESTING/LIN/derrsyx.f b/lapack-netlib/TESTING/LIN/derrsyx.f index c2d14caab..495302158 100644 --- a/lapack-netlib/TESTING/LIN/derrsyx.f +++ b/lapack-netlib/TESTING/LIN/derrsyx.f @@ -138,7 +138,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * DSYTRF @@ -528,7 +528,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * DSPTRF diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index d0580497d..878d9070c 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. 
INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'DLATRS3' INFOT = 1 - CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/derrvx.f b/lapack-netlib/TESTING/LIN/derrvx.f index 447b00bc4..f2d29f7a3 100644 --- a/lapack-netlib/TESTING/LIN/derrvx.f +++ b/lapack-netlib/TESTING/LIN/derrvx.f @@ -699,21 +699,27 @@ * ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN * -* DSYSV_AA -* - SRNAMT = 'DSYSV_AA' - INFOT = 1 - CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) +* DSYSV_AASEN +* + SRNAMT = 'DSYSV_AA' + INFOT = 1 + CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, 
INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -734,14 +740,18 @@ CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * diff --git a/lapack-netlib/TESTING/LIN/dgtt01.f b/lapack-netlib/TESTING/LIN/dgtt01.f index 3b5ff9e4c..6fd754db2 100644 --- a/lapack-netlib/TESTING/LIN/dgtt01.f +++ b/lapack-netlib/TESTING/LIN/dgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dgtt02.f b/lapack-netlib/TESTING/LIN/dgtt02.f index b3268b138..4fecff86e 100644 --- a/lapack-netlib/TESTING/LIN/dgtt02.f +++ b/lapack-netlib/TESTING/LIN/dgtt02.f @@ -41,14 +41,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose = Transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dlqt02.f b/lapack-netlib/TESTING/LIN/dlqt02.f index 158cd0b79..672d82bbc 100644 --- a/lapack-netlib/TESTING/LIN/dlqt02.f +++ b/lapack-netlib/TESTING/LIN/dlqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DLQT02 tests DORGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, DLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/dptt01.f b/lapack-netlib/TESTING/LIN/dptt01.f index 2c6c440ff..9e9c6bd85 100644 --- a/lapack-netlib/TESTING/LIN/dptt01.f +++ b/lapack-netlib/TESTING/LIN/dptt01.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dptt02.f b/lapack-netlib/TESTING/LIN/dptt02.f index a4802c696..e045b8779 100644 --- a/lapack-netlib/TESTING/LIN/dptt02.f +++ b/lapack-netlib/TESTING/LIN/dptt02.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dqlt02.f b/lapack-netlib/TESTING/LIN/dqlt02.f index 950cfe67b..7799e0858 100644 --- a/lapack-netlib/TESTING/LIN/dqlt02.f +++ b/lapack-netlib/TESTING/LIN/dqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DQLT02 tests DORGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. 
*> *> Given the QL factorization of an m-by-n matrix A, DQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/dqrt02.f b/lapack-netlib/TESTING/LIN/dqrt02.f index d0e4349d1..d41c7b324 100644 --- a/lapack-netlib/TESTING/LIN/dqrt02.f +++ b/lapack-netlib/TESTING/LIN/dqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DQRT02 tests DORGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, DQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/drqt02.f b/lapack-netlib/TESTING/LIN/drqt02.f index 7fbb6a6d4..faf639109 100644 --- a/lapack-netlib/TESTING/LIN/drqt02.f +++ b/lapack-netlib/TESTING/LIN/drqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DRQT02 tests DORGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, DRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/drzt01.f b/lapack-netlib/TESTING/LIN/drzt01.f index 8e969aba7..7a88a2c20 100644 --- a/lapack-netlib/TESTING/LIN/drzt01.f +++ b/lapack-netlib/TESTING/LIN/drzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL DORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL DORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/dsyt01_3.f b/lapack-netlib/TESTING/LIN/dsyt01_3.f index 2a7d9d142..060a9caed 100644 --- a/lapack-netlib/TESTING/LIN/dsyt01_3.f +++ b/lapack-netlib/TESTING/LIN/dsyt01_3.f @@ -183,7 +183,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL DSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/schktp.f b/lapack-netlib/TESTING/LIN/schktp.f index ff05c1d97..a5243f651 100644 --- a/lapack-netlib/TESTING/LIN/schktp.f +++ b/lapack-netlib/TESTING/LIN/schktp.f @@ -86,7 +86,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/serrsy.f b/lapack-netlib/TESTING/LIN/serrsy.f index c562b417b..ab422ba5c 100644 --- a/lapack-netlib/TESTING/LIN/serrsy.f +++ b/lapack-netlib/TESTING/LIN/serrsy.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * SSYTRF @@ -581,7 +581,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * SSPTRF diff --git a/lapack-netlib/TESTING/LIN/serrsyx.f b/lapack-netlib/TESTING/LIN/serrsyx.f index ed47c37bd..cffd28a52 100644 --- a/lapack-netlib/TESTING/LIN/serrsyx.f +++ b/lapack-netlib/TESTING/LIN/serrsyx.f @@ -137,7 +137,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. 
* * SSYTRF @@ -527,7 +527,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * SSPTRF diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index af1ce0a8e..391b54c3f 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'SLATRS3' INFOT = 1 - CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/serrvx.f b/lapack-netlib/TESTING/LIN/serrvx.f index c3db47332..440f9113e 100644 --- a/lapack-netlib/TESTING/LIN/serrvx.f +++ b/lapack-netlib/TESTING/LIN/serrvx.f @@ -582,6 +582,9 @@ INFOT = 3 CALL SSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) @@ -647,6 +650,9 @@ INFOT = 3 CALL SSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV_ROOK( 'U', 2, 0, A, 1, 
IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) @@ -694,7 +700,7 @@ * ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN * -* SSYSV_AA +* SSYSV_AASEN * SRNAMT = 'SSYSV_AA' INFOT = 1 @@ -706,13 +712,19 @@ INFOT = 3 CALL SSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * -* DSYSV_AASEN_2STAGE +* SSYSV_AASEN_2STAGE * SRNAMT = 'SSYSV_AA_2STAGE' INFOT = 1 @@ -729,14 +741,18 @@ CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * diff --git a/lapack-netlib/TESTING/LIN/sgtt01.f b/lapack-netlib/TESTING/LIN/sgtt01.f index 5d88c91f7..5d639af99 100644 --- a/lapack-netlib/TESTING/LIN/sgtt01.f +++ b/lapack-netlib/TESTING/LIN/sgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sgtt02.f b/lapack-netlib/TESTING/LIN/sgtt02.f index a4eb09f6b..6daea295f 100644 --- a/lapack-netlib/TESTING/LIN/sgtt02.f +++ b/lapack-netlib/TESTING/LIN/sgtt02.f @@ -41,14 +41,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose = Transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/slqt02.f b/lapack-netlib/TESTING/LIN/slqt02.f index d59a8a01c..1c7bd4f0d 100644 --- a/lapack-netlib/TESTING/LIN/slqt02.f +++ b/lapack-netlib/TESTING/LIN/slqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SLQT02 tests SORGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, SLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/sptt01.f b/lapack-netlib/TESTING/LIN/sptt01.f index fef18dd2b..5b33bfad5 100644 --- a/lapack-netlib/TESTING/LIN/sptt01.f +++ b/lapack-netlib/TESTING/LIN/sptt01.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. 
*> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sptt02.f b/lapack-netlib/TESTING/LIN/sptt02.f index 51a6a074a..2bd5535aa 100644 --- a/lapack-netlib/TESTING/LIN/sptt02.f +++ b/lapack-netlib/TESTING/LIN/sptt02.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sqlt02.f b/lapack-netlib/TESTING/LIN/sqlt02.f index 5d381b950..f26352eb3 100644 --- a/lapack-netlib/TESTING/LIN/sqlt02.f +++ b/lapack-netlib/TESTING/LIN/sqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SQLT02 tests SORGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, SQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/sqrt02.f b/lapack-netlib/TESTING/LIN/sqrt02.f index 72163f0a9..44b9c6270 100644 --- a/lapack-netlib/TESTING/LIN/sqrt02.f +++ b/lapack-netlib/TESTING/LIN/sqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SQRT02 tests SORGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, SQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/srqt02.f b/lapack-netlib/TESTING/LIN/srqt02.f index ca0594f7a..a33c98ba4 100644 --- a/lapack-netlib/TESTING/LIN/srqt02.f +++ b/lapack-netlib/TESTING/LIN/srqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SRQT02 tests SORGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, SRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/srzt01.f b/lapack-netlib/TESTING/LIN/srzt01.f index 4478c0506..a2b8b29ba 100644 --- a/lapack-netlib/TESTING/LIN/srzt01.f +++ b/lapack-netlib/TESTING/LIN/srzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL SORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL SORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/ssyt01_3.f b/lapack-netlib/TESTING/LIN/ssyt01_3.f index 147553db9..951fcb7d6 100644 --- a/lapack-netlib/TESTING/LIN/ssyt01_3.f +++ b/lapack-netlib/TESTING/LIN/ssyt01_3.f @@ -183,7 +183,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL SSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zchktp.f b/lapack-netlib/TESTING/LIN/zchktp.f index 1798c24e7..ab46f5ce5 100644 --- a/lapack-netlib/TESTING/LIN/zchktp.f +++ b/lapack-netlib/TESTING/LIN/zchktp.f @@ -87,7 +87,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 275ca2857..4af538124 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. 
CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) + DOUBLE PRECISION RESULT( NTESTS ), RWORK2( 2*NMAX), + $ SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -544,8 +545,8 @@ CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from ZLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/zdrvab.f b/lapack-netlib/TESTING/LIN/zdrvab.f index 130515959..772eb08af 100644 --- a/lapack-netlib/TESTING/LIN/zdrvab.f +++ b/lapack-netlib/TESTING/LIN/zdrvab.f @@ -348,7 +348,7 @@ CALL ZGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/zdrvac.f b/lapack-netlib/TESTING/LIN/zdrvac.f index 20f8eb1e5..bbf73a263 100644 --- a/lapack-netlib/TESTING/LIN/zdrvac.f +++ b/lapack-netlib/TESTING/LIN/zdrvac.f @@ -367,7 +367,7 @@ CALL ZPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f index 21497477c..5406a76bf 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f @@ -229,7 +229,7 @@ * Test path * PATH( 1: 1 ) = 'Zomplex precision' - PATH( 2: 3 ) = 'H2' + PATH( 2: 3 ) = 'S2' * * Path to generate matrices * diff --git a/lapack-netlib/TESTING/LIN/zerrhe.f b/lapack-netlib/TESTING/LIN/zerrhe.f index e49e5037c..40dd25d75 100644 --- a/lapack-netlib/TESTING/LIN/zerrhe.f +++ b/lapack-netlib/TESTING/LIN/zerrhe.f @@ -135,7 +135,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHETRF @@ -580,7 +580,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrhex.f b/lapack-netlib/TESTING/LIN/zerrhex.f index b6e8b77ef..bdcdefff7 100644 --- a/lapack-netlib/TESTING/LIN/zerrhex.f +++ b/lapack-netlib/TESTING/LIN/zerrhex.f @@ -138,7 +138,7 @@ OK = .TRUE. * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * IF( LSAMEN( 2, C2, 'HE' ) ) THEN @@ -526,7 +526,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. 
* * ZHPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrsy.f b/lapack-netlib/TESTING/LIN/zerrsy.f index a50f9d24e..932df1936 100644 --- a/lapack-netlib/TESTING/LIN/zerrsy.f +++ b/lapack-netlib/TESTING/LIN/zerrsy.f @@ -132,7 +132,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZSYTRF @@ -471,7 +471,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * ZSPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrsyx.f b/lapack-netlib/TESTING/LIN/zerrsyx.f index 23d2a5a2b..9d5e71288 100644 --- a/lapack-netlib/TESTING/LIN/zerrsyx.f +++ b/lapack-netlib/TESTING/LIN/zerrsyx.f @@ -139,7 +139,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZSYTRF @@ -525,7 +525,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * ZSPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 211b92154..640c39793 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. 
DOUBLE PRECISION R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'ZLATRS3' INFOT = 1 - CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. 
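Every *ERRTR hunk above instantiates the same error-exit idiom from the LAPACK test harness, so it is worth spelling out once. A condensed sketch, reusing the INFOT = 5 probe from the ZLATRS3 hunk above (illustrative only; SRNAMT, INFOT, NOUT, LERR and OK are assumed to be shared with the harness's substitute XERBLA through its COMMON blocks, as in the surrounding test files):

*     Record the routine under test and the position of the argument
*     that the next call deliberately gets wrong.
      SRNAMT = 'ZLATRS3'
      INFOT = 5
*     Call with argument 5 (N = -1) invalid; the substitute XERBLA
*     notes in LERR that it was entered, and with which position.
      CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
     $              RW, RW( 2 ), 1, INFO )
*     CHKXER compares that record against SRNAMT/INFOT and, on a
*     mismatch, reports on unit NOUT and clears OK.
      CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )

The SCALE to SCALES(0) swap in the local declarations serves the same idiom: the *LATRS3 routines take an array for their SCALE argument, and passing the scalar SCALE is a rank mismatch that recent Fortran compilers reject even in calls that are expected to error out immediately.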
diff --git a/lapack-netlib/TESTING/LIN/zerrvx.f b/lapack-netlib/TESTING/LIN/zerrvx.f index ea7823df3..80b3aaf4a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvx.f +++ b/lapack-netlib/TESTING/LIN/zerrvx.f @@ -653,6 +653,9 @@ INFOT = 3 CALL ZHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) @@ -700,21 +703,27 @@ * ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN * -* ZHESV_AA -* - SRNAMT = 'ZHESV_AA' - INFOT = 1 - CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) +* ZHESV_AASEN +* + SRNAMT = 'ZHESV_AA' + INFOT = 1 + CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN * @@ -735,16 +744,44 @@ CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* ZSYSV_AASEN +* + SRNAMT = 'ZSYSV_AA' + INFOT = 1 + CALL ZSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ 
-765,17 +802,21 @@ CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) -** +* ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * ZHPSV @@ -836,6 +877,9 @@ INFOT = 3 CALL ZSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) @@ -900,6 +944,9 @@ INFOT = 3 CALL ZSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) diff --git a/lapack-netlib/TESTING/LIN/zgtt01.f b/lapack-netlib/TESTING/LIN/zgtt01.f index c63fdbc25..a8ac13d2d 100644 --- a/lapack-netlib/TESTING/LIN/zgtt01.f +++ b/lapack-netlib/TESTING/LIN/zgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zgtt02.f b/lapack-netlib/TESTING/LIN/zgtt02.f index 7362967be..f86fe9244 100644 --- a/lapack-netlib/TESTING/LIN/zgtt02.f +++ b/lapack-netlib/TESTING/LIN/zgtt02.f @@ -40,14 +40,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zhet01_3.f b/lapack-netlib/TESTING/LIN/zhet01_3.f index 0a76404d6..5beed9042 100644 --- a/lapack-netlib/TESTING/LIN/zhet01_3.f +++ b/lapack-netlib/TESTING/LIN/zhet01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zlqt02.f b/lapack-netlib/TESTING/LIN/zlqt02.f index c55d76ccd..9ba98ff1f 100644 --- a/lapack-netlib/TESTING/LIN/zlqt02.f +++ b/lapack-netlib/TESTING/LIN/zlqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZLQT02 tests ZUNGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. 
*> *> Given the LQ factorization of an m-by-n matrix A, ZLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/zptt01.f b/lapack-netlib/TESTING/LIN/zptt01.f index e842c7e46..9f2359c2a 100644 --- a/lapack-netlib/TESTING/LIN/zptt01.f +++ b/lapack-netlib/TESTING/LIN/zptt01.f @@ -36,7 +36,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zptt02.f b/lapack-netlib/TESTING/LIN/zptt02.f index 6e3a8aed0..8a1e2961c 100644 --- a/lapack-netlib/TESTING/LIN/zptt02.f +++ b/lapack-netlib/TESTING/LIN/zptt02.f @@ -46,7 +46,7 @@ *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zqlt02.f b/lapack-netlib/TESTING/LIN/zqlt02.f index 1f84cfa5a..7cb889931 100644 --- a/lapack-netlib/TESTING/LIN/zqlt02.f +++ b/lapack-netlib/TESTING/LIN/zqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZQLT02 tests ZUNGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, ZQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/zqrt02.f b/lapack-netlib/TESTING/LIN/zqrt02.f index 2dbefaf84..a32703c49 100644 --- a/lapack-netlib/TESTING/LIN/zqrt02.f +++ b/lapack-netlib/TESTING/LIN/zqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZQRT02 tests ZUNGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, ZQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/zrqt02.f b/lapack-netlib/TESTING/LIN/zrqt02.f index 548321d00..c44e04cb6 100644 --- a/lapack-netlib/TESTING/LIN/zrqt02.f +++ b/lapack-netlib/TESTING/LIN/zrqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZRQT02 tests ZUNGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, ZRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/zrzt01.f b/lapack-netlib/TESTING/LIN/zrzt01.f index 2eba7ba6f..b6e4d73f9 100644 --- a/lapack-netlib/TESTING/LIN/zrzt01.f +++ b/lapack-netlib/TESTING/LIN/zrzt01.f @@ -159,7 +159,7 @@ * * R = R * P(1) * ... 
*P(m) * - CALL ZUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL ZUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/zsyt01_3.f b/lapack-netlib/TESTING/LIN/zsyt01_3.f index 202488db9..a572353f2 100644 --- a/lapack-netlib/TESTING/LIN/zsyt01_3.f +++ b/lapack-netlib/TESTING/LIN/zsyt01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/cgbsvx.f b/lapack-netlib/cgbsvx.f new file mode 100644 index 000000000..eaab5682c --- /dev/null +++ b/lapack-netlib/cgbsvx.f @@ -0,0 +1,644 @@ +*> \brief CGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* REAL BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGBSVX uses the LU factorization to compute the solution to a complex +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. 
Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is COMPLEX array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by CGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). 
+*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by CGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. 
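With the argument list documented to this point (the remaining output arguments follow below), a minimal call to the expert driver can be sketched. This is illustrative only, with a hypothetical program name and made-up matrix values; it fills the band storage exactly as described under AB, AB(KU+1+i-j,j) = A(i,j), and lets CGBSVX equilibrate via FACT = 'E':

*     Solve a 4x4 complex tridiagonal system (KL = KU = 1) with the
*     expert driver; values are illustrative only.
      PROGRAM GBSOLV
      INTEGER            N, KL, KU, NRHS, LDAB, LDAFB
      PARAMETER          ( N = 4, KL = 1, KU = 1, NRHS = 1,
     $                   LDAB = KL+KU+1, LDAFB = 2*KL+KU+1 )
      CHARACTER          EQUED
      INTEGER            IPIV( N ), INFO, J
      REAL               R( N ), C( N ), FERR( NRHS ), BERR( NRHS ),
     $                   RWORK( N ), RCOND
      COMPLEX            AB( LDAB, N ), AFB( LDAFB, N ),
     $                   B( N, NRHS ), X( N, NRHS ), WORK( 2*N )
*     Row KU+1 of AB holds the diagonal, row KU the superdiagonal,
*     row KU+2 the subdiagonal; the unused corner entries of the
*     band are never referenced.
      DO 10 J = 1, N
         AB( 1, J ) = CMPLX( -1.0E0, 0.0E0 )
         AB( 2, J ) = CMPLX( 4.0E0, 1.0E0 )
         AB( 3, J ) = CMPLX( -1.0E0, 0.0E0 )
         B( J, 1 ) = CMPLX( 1.0E0, 0.0E0 )
   10 CONTINUE
      CALL CGBSVX( 'E', 'N', N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB,
     $             IPIV, EQUED, R, C, B, N, X, N, RCOND, FERR, BERR,
     $             WORK, RWORK, INFO )
      IF( INFO.EQ.0 ) WRITE( *, * ) 'RCOND = ', RCOND
      END

On a successful return (INFO = 0, or N+1 with the near-singularity warning), X holds the solution of the original system and RWORK(1) the reciprocal pivot growth factor described below.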
+*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(1,N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0<INFO<=N, then +*> RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGBsolve +* +* ===================================================================== + SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters ..
+ REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL CLANGB, CLANTB, SLAMCH + EXTERNAL LSAME, CLANGB, CLANTB, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CGBCON, CGBEQU, CGBRFS, CGBTRF, CGBTRS, + $ CLACPY, CLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL CGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL CLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL CCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL CGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. 
+* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = CLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = CLANGB( NORM, N, KL, KU, AB, LDAB, RWORK ) + RPVGRW = CLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL CGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL CGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL CGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of CGBSVX +* + END diff --git a/lapack-netlib/cgejsv.f b/lapack-netlib/cgejsv.f new file mode 100644 index 000000000..51a6cee4e --- /dev/null +++ b/lapack-netlib/cgejsv.f @@ -0,0 +1,2232 @@ +*> \brief \b CGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) +* REAL SVA( N ), RWORK( LRWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGEJSV computes the singular value decomposition (SVD) of a complex M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^*, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and +*> [V] is an N-by-N unitary matrix. 
The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> \endverbatim +*> +*> Arguments: +*> ========== +*> +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. +*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings +*> D1, D2, and well-conditioned matrix C, this option gives +*> higher accuracy than the 'C' option. If the structure of the +*> input matrix is not known, and relative accuracy is +*> desirable, then this option is advisable. The input matrix A +*> is preprocessed with QR factorization with FULL (row and +*> column) pivoting. +*> = 'G': Computation as with 'F' with an additional estimate of the +*> condition number of B, where A=B*D. If A has heavily weighted +*> rows, then using this condition number gives too pessimistic +*> error bound. +*> = 'A': Small singular values are not well determined by the data +*> and are considered as noisy; the matrix is treated as +*> numerically rank deficient. The error in the computed +*> singular values is bounded by f(m,n)*epsilon*||A||. +*> The computed SVD A = U * S * V^* restores A up to +*> f(m,n)*epsilon*||A||. +*> This gives the procedure the licence to discard (set to zero) +*> all singular values below N*epsilon*||A||. +*> = 'R': Similar as in 'A'. Rank revealing property of the initial +*> QR factorization is used do reveal (using triangular factor) +*> a gap sigma_{r+1} < epsilon * sigma_r in which case the +*> numerical RANK is declared to be r. The SVD is computed with +*> absolute error bounds, but more accurately than with 'A'. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> Specifies whether to compute the columns of U: +*> = 'U': N columns of U are returned in the array U. +*> = 'F': full set of M left sing. vectors is returned in the array U. +*> = 'W': U may be used as workspace of length M*N. See the description +*> of U. +*> = 'N': U is not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> Specifies whether to compute the matrix V: +*> = 'V': N columns of V are returned in the array V; Jacobi rotations +*> are not explicitly accumulated. +*> = 'J': N columns of V are returned in the array V, but they are +*> computed as the product of Jacobi rotations, if JOBT = 'N'. +*> = 'W': V may be used as workspace of length N*N. See the description +*> of V. +*> = 'N': V is not computed. 
+*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> Specifies the RANGE for the singular values. Issues the licence to +*> set to zero small positive singular values if they are outside +*> specified range. If A .NE. 0 is scaled so that the largest singular +*> value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues +*> the licence to kill columns of A whose norm in c*A is less than +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, +*> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). +*> = 'N': Do not kill small columns of c*A. This option assumes that +*> BLAS and QR factorizations and triangular solvers are +*> implemented to work in that range. If the condition of A +*> is greater than BIG, use CGESVJ. +*> = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)] +*> (roughly, as described above). This option is recommended. +*> =========================== +*> For computing the singular values in the FULL range [SFMIN,BIG] +*> use CGESVJ. +*> \endverbatim +*> +*> \param[in] JOBT +*> \verbatim +*> JOBT is CHARACTER*1 +*> If the matrix is square then the procedure may determine to use +*> transposed A if A^* seems to be better with respect to convergence. +*> If the matrix is not square, JOBT is ignored. +*> The decision is based on two values of entropy over the adjoint +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). +*> = 'T': transpose if entropy test indicates possibly faster +*> convergence of Jacobi process if A^* is taken as input. If A is +*> replaced with A^*, then the row pivoting is included automatically. +*> = 'N': do not speculate. +*> The option 'T' can be used to compute only the singular values, or +*> the full SVD (U, SIGMA and V). For only one set of singular vectors +*> (U or V), the caller should provide both U and V, as one of the +*> matrices is used as workspace if the matrix A is transposed. +*> The implementer can easily remove this constraint and make the +*> code more complicated. See the descriptions of U and V. +*> In general, this option is considered experimental, and 'N'; should +*> be preferred. This is subject to changes in the future. +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> Issues the licence to introduce structured perturbations to drown +*> denormalized numbers. This licence should be active if the +*> denormals are poorly implemented, causing slow computation, +*> especially in cases of fast convergence (!). For details see [1,2]. +*> For the sake of simplicity, this perturbations are included only +*> when the full SVD or only the singular values are requested. The +*> implementer/user can easily add the perturbation for the cases of +*> computing one set of singular vectors. +*> = 'P': introduce perturbation +*> = 'N': do not perturb +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] SVA +*> \verbatim +*> SVA is REAL array, dimension (N) +*> On exit, +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. 
During +*> the computation SVA contains Euclidean column norms of the +*> iterated matrices in the array A. +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if +*> sigma_max(A) overflows or if small singular values have been +*> saved from underflow by scaling the input matrix A. +*> - If JOBR='R' then some of the singular values may be returned +*> as exact zeros obtained by "set to zero" because they are +*> below the numerical rank threshold or are denormalized numbers. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is COMPLEX array, dimension ( LDU, N ) or ( LDU, M ) +*> If JOBU = 'U', then U contains on exit the M-by-N matrix of +*> the left singular vectors. +*> If JOBU = 'F', then U contains on exit the M-by-M matrix of +*> the left singular vectors, including an ONB +*> of the orthogonal complement of the Range(A). +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), +*> then U is used as workspace if the procedure +*> replaces A with A^*. In that case, [V] is computed +*> in U as left singular vectors of A^* and then +*> copied back to the V array. This 'W' option is just +*> a reminder to the caller that in this case U is +*> reserved as workspace of length N*N. +*> If JOBU = 'N' U is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER +*> The leading dimension of the array U, LDU >= 1. +*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is COMPLEX array, dimension ( LDV, N ) +*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of +*> the right singular vectors; +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), +*> then V is used as workspace if the pprocedure +*> replaces A with A^*. In that case, [U] is computed +*> in V as right singular vectors of A^* and then +*> copied back to the U array. This 'W' option is just +*> a reminder to the caller that in this case V is +*> reserved as workspace of length N*N. +*> If JOBV = 'N' V is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V' or 'J' or 'W', then LDV >= N. +*> \endverbatim +*> +*> \param[out] CWORK +*> \verbatim +*> CWORK is COMPLEX array, dimension (MAX(2,LWORK)) +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the required length of +*> CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> Length of CWORK to confirm proper allocation of workspace. +*> LWORK depends on the job: +*> +*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and +*> 1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'): +*> LWORK >= 2*N+1. This is the minimal requirement. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= N + (N+1)*NB. Here NB is the optimal +*> block size for CGEQP3 and CGEQRF. +*> In general, optimal LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ)). +*> 1.2. .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). In this case, LWORK the minimal +*> requirement is LWORK >= N*N + 2*N. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N. 
+*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ), +*> N*N+LWORK(CPOCON)). +*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> (JOBU = 'N') +*> 2.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, +*> CUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), N+LWORK(CGESVJ), +*> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). +*> 2.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, +*> CUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ), +*> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). +*> 3. If SIGMA and the left singular vectors are needed +*> 3.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). +*> 3.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CPOCON), +*> 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). +*> +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' +*> the minimal requirement is LWORK >= 5*N+2*N*N. +*> 4.2. if JOBV = 'J' the minimal requirement is +*> LWORK >= 4*N+N*N. +*> In both cases, the allocated CWORK can accommodate blocked runs +*> of CGEQP3, CGEQRF, CGELQF, CUNMQR, CUNMLQ. +*> +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the +*> minimal length of CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(7,LRWORK)) +*> On exit, +*> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) +*> such that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> RWORK(2) = See the description of RWORK(1). +*> RWORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +*> It is computed using CPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. 
+*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> RWORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> RWORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy +*> of diag(A^* * A) / Trace(A^* * A) taken as point in the +*> probability simplex. +*> RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).) +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit RWORK(1) contains the required length of +*> RWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> Length of RWORK to confirm proper allocation of workspace. +*> LRWORK depends on the job: +*> +*> 1. If only the singular values are requested i.e. if +*> LSAME(JOBU,'N') .AND. LSAME(JOBV,'N') +*> then: +*> 1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then: LRWORK = max( 7, 2 * M ). +*> 1.2. Otherwise, LRWORK = max( 7, N ). +*> 2. If singular values with the right singular vectors are requested +*> i.e. if +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND. +*> .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) +*> then: +*> 2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 2.2. Otherwise, LRWORK = max( 7, N ). +*> 3. If singular values with the left singular vectors are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 3.2. Otherwise, LRWORK = max( 7, N ). +*> 4. If singular values with both the left and the right singular vectors +*> are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 4.2. Otherwise, LRWORK = max( 7, N ). +*> +*> If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and +*> the length of RWORK is returned in RWORK(1). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, of dimension at least 4, that further depends +*> on the job: +*> +*> 1. If only the singular values are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 2. If the singular values and the right singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 3. If the singular values and the left singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4. If the singular values with both the left and the right singular vectors +*> are requested, then: +*> 4.1. 
If LSAME(JOBV,'J') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N. +*> +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to +*> do the job as specified by the JOB parameters. +*> If the call to CGEJSV is a workspace query (indicated by LWORK = -1 and +*> LRWORK = -1), then on exit IWORK(1) contains the required length of +*> IWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: CGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> CGEJSV implements a preconditioned Jacobi SVD algorithm. It uses CGEQP3, +*> CGEQRF, and CGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by CGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (CGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / SLAMCH('O') are +*> returned as zeros. See JOBR for details on this. 
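The workspace formulas above rarely need to be evaluated by hand: as documented for CWORK, RWORK and IWORK, a call with LWORK = -1 or LRWORK = -1 is a workspace query. A sketch of the two-call idiom for the full SVD, with a hypothetical program name and illustrative sizes:

*     Workspace query for CGEJSV (JOBU = 'U', JOBV = 'V'); a real
*     caller would allocate CWORK, RWORK and IWORK from the returned
*     sizes, fill A, and repeat the call with those lengths.
      PROGRAM JSVQRY
      INTEGER            M, N, LDA
      PARAMETER          ( M = 6, N = 4, LDA = M )
      COMPLEX            A( LDA, N ), U( M, N ), V( N, N ), CDUM( 2 )
      REAL               SVA( N ), RDUM( 7 )
      INTEGER            IDUM( 4 ), INFO
*     A is not referenced during the query.
      CALL CGEJSV( 'C', 'U', 'V', 'R', 'N', 'N', M, N, A, LDA, SVA,
     $             U, M, V, N, CDUM, -1, RDUM, -1, IDUM, INFO )
      WRITE( *, * ) 'CWORK: optimal', INT( CDUM( 1 ) ),
     $   ', minimal', INT( CDUM( 2 ) )
      WRITE( *, * ) 'minimal RWORK:', INT( RDUM( 1 ) ),
     $   ', minimal IWORK:', IDUM( 1 )
      END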
+*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (CGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: CGEQP3) should be +*> implemented as in [3]. We have a new version of CGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in CGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of CGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributor: +* ================== +*> +*> Zlatko Drmac (Zagreb, Croatia) +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008, 2016. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) + REAL SVA( N ), RWORK( LRWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), CONE = ( 1.0E0, 0.0E0 ) ) +* .. +* .. Local Scalars .. 
+ COMPLEX CTEMP + REAL AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK, + $ CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, MAXPRJ, SCALEM, + $ SCONDA, SFMIN, SMALL, TEMP1, USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LQUERY, + $ LSVEC, L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, NOSCAL, + $ ROWPIV, RSVEC, TRANSP +* + INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK + INTEGER LWCON, LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM, + $ LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF + INTEGER LWRK_CGELQF, LWRK_CGEQP3, LWRK_CGEQP3N, LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CGESVJV, LWRK_CGESVJU, LWRK_CUNMLQ, + $ LWRK_CUNMQR, LWRK_CUNMQRM +* .. +* .. Local Arrays + COMPLEX CDUMMY(1) + REAL RDUMMY(1) +* +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, ALOG, MAX, MIN, REAL, NINT, SQRT +* .. +* .. External Functions .. + REAL SLAMCH, SCNRM2 + INTEGER ISAMAX, ICAMAX + LOGICAL LSAME + EXTERNAL ISAMAX, ICAMAX, LSAME, SLAMCH, SCNRM2 +* .. +* .. External Subroutines .. + EXTERNAL SLASSQ, CCOPY, CGELQF, CGEQP3, CGEQRF, CLACPY, CLAPMR, + $ CLASCL, SLASCL, CLASET, CLASSQ, CLASWP, CUNGQR, CUNMLQ, + $ CUNMQR, CPOCON, SSCAL, CSSCAL, CSWAP, CTRSM, CLACGV, + $ XERBLA +* + EXTERNAL CGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK, LRWORK are written with a lot of redundancy and +* can be simplified. However, this verbose form is useful for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for CGEQP3 of an M x N matrix, +* CGEQRF of an N x N matrix, CGELQF of an N x N matrix, +* CUNMLQ for computing N x N matrix, CUNMQR for computing N x N +* matrix, CUNMQR for computing M x N matrix, respectively. + LWQP3 = N+1 + LWQRF = MAX( 1, N ) + LWLQF = MAX( 1, N ) + LWUNMLQ = MAX( 1, N ) + LWUNMQR = MAX( 1, N ) + LWUNMQRM = MAX( 1, M ) +* .. minimal workspace length for CPOCON of an N x N matrix + LWCON = 2 * N +* .. 
minimal workspace length for CGESVJ of an N x N matrix, +* without and with explicit accumulation of Jacobi rotations + LWSVDJ = MAX( 2 * N, 1 ) + LWSVDJV = MAX( 2 * N, 1 ) +* .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ + LRWQP3 = 2 * N + LRWCON = N + LRWSVDJ = N + IF ( LQUERY ) THEN + CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) + CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_CGEQRF = INT( CDUMMY(1) ) + CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_CGELQF = INT( CDUMMY(1) ) + END IF + MINWRK = 2 + OPTWRK = 2 + MINIWRK = N + IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN +* .. minimal and optimal sizes of the complex workspace if +* only the singular values are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ ) + ELSE + MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, + $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, + $ N+LWRK_CGEQRF, LWRK_CGESVJ ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWRK_CGEQRF, + $ LWRK_CGESVJ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the right singular vectors are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF, + $ 2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, + $ N+LWSVDJ, N+LWUNMLQ ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, + $ N+LWRK_CGELQF, 2*N+LWRK_CGEQRF, + $ N+LWRK_CGESVJ, N+LWRK_CUNMLQ ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, LWRK_CGESVJ,N+LWRK_CGELQF, + $ 2*N+LWRK_CGEQRF, N+LWRK_CGESVJ, + $ N+LWRK_CUNMLQ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. 
minimal and optimal sizes of the complex workspace if the +* singular values and the left singular vectors are requested + IF ( ERREST ) THEN + MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM ) + ELSE + MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CUNMQRM ) + ELSE + OPTWRK = N + MAX( LWRK_CGEQP3, N+LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CUNMQRM ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE +* .. minimal and optimal sizes of the complex workspace if the +* full SVD is requested + IF ( .NOT. JRACC ) THEN + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + END IF + MINIWRK = MINIWRK + N + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + END IF + IF ( LQUERY ) THEN + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + IF ( .NOT. 
JRACC ) THEN + CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_CGEQP3N = INT( CDUMMY(1) ) + CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJU = INT( CDUMMY(1) ) + CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJV = INT( CDUMMY(1) ) + CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, + $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, + $ 2*N+LWRK_CGEQP3N, + $ 2*N+N**2+N+LWRK_CGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_CGESVJ, + $ 2*N+N**2+N+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ 2*N+N**2+N+LWRK_CUNMLQ, + $ N+N**2+LWRK_CGESVJU, + $ N+LWRK_CUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, + $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, + $ 2*N+LWRK_CGEQP3N, + $ 2*N+N**2+N+LWRK_CGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_CGESVJ, + $ 2*N+N**2+N+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ 2*N+N**2+N+LWRK_CUNMLQ, + $ N+N**2+LWRK_CGESVJU, + $ N+LWRK_CUNMQRM ) + END IF + ELSE + CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJV = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, + $ 2*N+LWRK_CGEQRF, 2*N+N**2, + $ 2*N+N**2+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR,N+LWRK_CUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, 2*N+LWRK_CGEQRF, + $ 2*N+N**2, 2*N+N**2+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ N+LWRK_CUNMQRM ) + END IF + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + END IF + END IF + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( OPTWRK, MINWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 + IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'CGEJSV', - INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + CWORK(1) = OPTWRK + CWORK(2) = MINWRK + RWORK(1) = MINRWRK + IWORK(1) = MAX( 4, MINIWRK ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:4) = 0 + RWORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure SLAMCH() does not fail on the target architecture. +* + EPSLN = SLAMCH('Epsilon') + SFMIN = SLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = SLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / SQRT(REAL(M)*REAL(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL CLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. 
BIG ) THEN + INFO = - 9 + CALL XERBLA( 'CGEJSV', -INFO ) + RETURN + END IF + AAQQ = SQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL SSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL CLASET( 'G', M, N1, CZERO, CONE, U, LDU ) + IF ( RSVEC ) CALL CLASET( 'G', N, N, CZERO, CONE, V, LDV ) + RWORK(1) = ONE + RWORK(2) = ONE + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + IWORK(4) = -1 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill nonzero columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL CLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL CLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL CGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR ) + CALL CUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR ) + CALL CCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = CONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + RWORK(1) = ONE / SCALEM + RWORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IWORK(4) = -1 + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL CLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* CLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + RWORK(M+p) = XSC * SCALEM + RWORK(p) = XSC * (SCALEM*SQRT(TEMP1)) + AATMAX = MAX( AATMAX, RWORK(p) ) + IF (RWORK(p) .NE. ZERO) + $ AATMIN = MIN(AATMIN,RWORK(p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + RWORK(M+p) = SCALEM*ABS( A(p,ICAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, RWORK(M+p) ) + AATMIN = MIN( AATMIN, RWORK(M+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^* would be better +* input for the preconditioned Jacobi SVD, with faster convergence. 
+* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL SLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * ALOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / ALOG(REAL(N)) +* +* Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex. +* It is derived from the diagonal of A^* * A. Do the same with the +* diagonal of A * A^*, compute the entropy of the corresponding +* probability distribution. Note that A * A^* and A^* * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = 1, M + BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * ALOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / ALOG(REAL(M)) +* +* Analyze the entropies and decide A or A^*. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^* is better than A, take the adjoint of A. This is allowed +* only for square matrices, M=N. + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + A(p,p) = CONJG(A(p,p)) + DO 1116 q = p + 1, N + CTEMP = CONJG(A(q,p)) + A(q,p) = CONJG(A(p,q)) + A(p,q) = CTEMP + 1116 CONTINUE + 1115 CONTINUE + A(N,N) = CONJG(A(N,N)) + DO 1117 p = 1, N + RWORK(M+p) = SVA(p) + SVA(p) = RWORK(p) +* previously computed row 2-norms are now column 2-norms +* of the transposed matrix + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than SQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to SQRT(BIG/N). The only reason to keep +* SQRT(BIG) instead of BIG is the fact that CGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then CGESVJ will compute them. So, in that case, +* one should use CGESVJ instead of CGEJSV. + BIG1 = SQRT( BIG ) + TEMP1 = SQRT( BIG / REAL(N) ) +* >> for future updates: allow bigger range, i.e. the largest column +* will be allowed up to BIG/N and CGESVJ will do the rest. However, for +* this all other (LAPACK) components must allow such a range. +* TEMP1 = BIG/REAL(N) +* TEMP1 = BIG * EPSLN this should 'almost' work with current LAPACK components + CALL SLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL CLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. 
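+* (Concretely: near the end of this routine, and provided the scaled
+* values remain representable, the statement
+*    CALL SLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
+* multiplies SVA(1:NR) by USCAL2/USCAL1; any factor that cannot be
+* applied there is returned to the caller through
+* RWORK(1) = USCAL2*SCALEM and RWORK(2) = USCAL1.)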
+*
+      USCAL1 = TEMP1
+      USCAL2 = AAPP
+*
+      IF ( L2KILL ) THEN
+*        L2KILL enforces computation of nonzero singular values in
+*        the restricted range of condition number of the initial A,
+*        sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN).
+         XSC = SQRT( SFMIN )
+      ELSE
+         XSC = SMALL
+*
+*        Now, if the condition number of A is too big,
+*        sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN,
+*        as a precautionary measure, the full SVD is computed using
+*        CGESVJ with accumulated Jacobi rotations. This provides
+*        numerically more robust computation, at the cost of slightly
+*        increased run time. Depending on the concrete implementation
+*        of BLAS and LAPACK (i.e. how they behave in the presence of
+*        extreme ill-conditioning) the implementor may decide to
+*        remove this switch.
+         IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN
+            JRACC = .TRUE.
+         END IF
+*
+      END IF
+      IF ( AAQQ .LT. XSC ) THEN
+         DO 700 p = 1, N
+            IF ( SVA(p) .LT. XSC ) THEN
+               CALL CLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA )
+               SVA(p) = ZERO
+            END IF
+ 700     CONTINUE
+      END IF
+*
+*     Preconditioning using QR factorization with pivoting
+*
+      IF ( ROWPIV ) THEN
+*        Optional row permutation (Bjoerck row pivoting):
+*        A result by Cox and Higham shows that Bjoerck's
+*        row pivoting combined with standard column pivoting
+*        has a similar effect to Powell-Reid complete pivoting.
+*        The ell-infinity norms of A are made nonincreasing.
+         IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN
+            IWOFF = 2*N
+         ELSE
+            IWOFF = N
+         END IF
+         DO 1952 p = 1, M - 1
+            q = ISAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1
+            IWORK(IWOFF+p) = q
+            IF ( p .NE. q ) THEN
+               TEMP1      = RWORK(M+p)
+               RWORK(M+p) = RWORK(M+q)
+               RWORK(M+q) = TEMP1
+            END IF
+ 1952    CONTINUE
+         CALL CLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 )
+      END IF
+*
+*     End of the preparation phase (scaling, optional sorting and
+*     transposing, optional flushing of small columns).
+*
+*     Preconditioning
+*
+*     If the full SVD is needed, the right singular vectors are computed
+*     from a matrix equation, and for that we need theoretical analysis
+*     of the Businger-Golub pivoting. So we use CGEQP3 as the first RR QRF.
+*     In all other cases the first RR QRF can be chosen by other criteria
+*     (e.g. speed by replacing global with restricted window pivoting, such
+*     as in xGEQPX from TOMS # 782). Good results will be obtained using
+*     xGEQPX with properly (!) chosen numerical parameters.
+*     Any improvement of CGEQP3 improves overall performance of CGEJSV.
+*
+*     A * P1 = Q1 * [ R1^* 0]^*:
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL CGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N,
+     $             RWORK, IERR )
+*
+*     The upper triangular matrix R1 from the first QRF is inspected for
+*     rank deficiency and possibilities for deflation, or possible
+*     ill-conditioning. Depending on the user-specified flag L2RANK,
+*     the procedure explores possibilities to reduce the numerical
+*     rank by inspecting the computed upper triangular factor. If
+*     L2RANK or L2ABER are up, then CGEJSV will compute the SVD of
+*     A + dA, where ||dA|| <= f(M,N)*EPSLN.
+*
+      NR = 1
+      IF ( L2ABER ) THEN
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPSLN*||A||.
+         TEMP1 = SQRT(REAL(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .GE.
(TEMP1*ABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = SQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = SQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = ABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - REAL(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL CLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL CSSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + IF ( LSVEC )THEN + CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK, RWORK, IERR ) + END IF +* + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL CLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL CSSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL CPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL CLACPY( 'U', N, N, A, LDA, CWORK, N ) +*[] CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) +* Change: here index shifted by N to the left, CWORK(1:N) +* not needed for SIGMA only computation + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) +*[] CALL CSSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 ) + CALL CSSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. +*[] CALL CPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1, +*[] $ CWORK(N+N*N+1), RWORK, IERR ) + CALL CPOCON( 'U', N, CWORK, N, ONE, TEMP1, + $ CWORK(N*N+1), RWORK, IERR ) +* + END IF + IF ( TEMP1 .NE. ZERO ) THEN + SCONDA = ONE / SQRT(TEMP1) + ELSE + SCONDA = - ONE + END IF +* SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL CCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL CLACGV( N-p+1, A(p,p), 1 ) + 1946 CONTINUE + IF ( NR .EQ. 
N ) A(N,N) = CONJG(A(N,N)) +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / REAL(N) + DO 4947 q = 1, NR + CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL CGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL CCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL CLACGV( NR-p+1, A(p,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / REAL(N) + DO 1947 q = 1, NR + CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL CGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA, + $ N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* +* + ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) + $ .OR. + $ ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 1998 CONTINUE + CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) +* + CALL CGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + ELSE +* +* .. 
two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL CLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA ) + CALL CGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR) + CALL CLACPY( 'L', NR, NR, A, LDA, V, LDV ) + CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) + CALL CGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL CCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + CALL CLACGV( NR-p+1, V(p,p), 1 ) + 8998 CONTINUE + CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV) +* + CALL CGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1), LDV ) + CALL CLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1), LDV ) + CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV ) + END IF +* + CALL CUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK, + $ V, LDV, CWORK(N+1), LWORK-N, IERR ) +* + END IF +* .. permute the rows of V +* DO 8991 p = 1, N +* CALL CCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) +* 8991 CONTINUE +* CALL CLACPY( 'All', N, N, A, LDA, V, LDV ) + CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + IF ( TRANSP ) THEN + CALL CLACPY( 'A', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN +* + CALL CLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA ) +* + CALL CGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + CALL CLACGV( N-p+1, U(p,p), 1 ) + 1965 CONTINUE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL CGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL CCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + CALL CLACGV( N-p+1, U(p,p), 1 ) + 1967 CONTINUE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL CGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* + IF ( NR .LT. M ) THEN + CALL CLASET( 'A', M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL CLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU ) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + CALL CSSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL CLACPY( 'A', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of CGEJSV. 
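+* The loop below realizes the transpose: row p of the upper
+* triangular factor stored in A is copied into column p of V and
+* conjugated, so that V(1:N,1:NR) holds R1^* on exit from the loop.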
+* + DO 1968 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 2969 q = 1, NR + CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL CLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = SCNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1) + CALL CSSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL CPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, + $ CWORK(2*N+NR*NR+1),RWORK,IERR) + CONDR1 = ONE / SQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. REAL(N) +* more conservative <=> CONDR1 .LT. SQRT(REAL(N)) +* + COND_OK = SQRT(SQRT(REAL(NR))) +*[TP] COND_OK is a tuning parameter. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^* = Q2 * R2 + CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL CCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + CALL CLACGV(NR-p+1, V(p,p), 1 ) + 1969 CONTINUE + V(NR,NR)=CONJG(V(NR,NR)) +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to CGEQP3 +* should be replaced with eg. CALL CGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. +* +* R1^* * P2 = Q2 * R2 + DO 3003 p = 1, NR + IWORK(N+p) = 0 + 3003 CONTINUE + CALL CGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1), + $ CWORK(2*N+1), LWORK-2*N, RWORK, IERR ) +** CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), +** $ LWORK-2*N, IERR ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 3969 p = 2, NR + DO 3968 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. 
TEMP1 )
+*    $                   V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                   V(q,p) = CTEMP
+ 3968             CONTINUE
+ 3969          CONTINUE
+            END IF
+*
+            CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 8970 p = 2, NR
+                  DO 8971 q = 1, p - 1
+                     CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                           ZERO)
+*                    V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) )
+                     V(p,q) = - CTEMP
+ 8971             CONTINUE
+ 8970          CONTINUE
+            ELSE
+               CALL CLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV )
+            END IF
+*           Now, compute R2 = L3 * Q3, the LQ factorization.
+            CALL CGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1),
+     $           CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*           .. and estimate the condition number
+            CALL CLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR )
+            DO 4950 p = 1, NR
+               TEMP1 = SCNRM2( p, CWORK(2*N+N*NR+NR+p), NR )
+               CALL CSSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR )
+ 4950       CONTINUE
+            CALL CPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $           CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR )
+            CONDR2 = ONE / SQRT(TEMP1)
+*
+*
+            IF ( CONDR2 .GE. COND_OK ) THEN
+*              .. save the Householder vectors used for Q3
+*              (this overwrites the copy of R2, as it will not be
+*              needed in this branch, but it does not overwrite the
+*              Householder vectors of Q2).
+               CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
+*              .. and the rest of the information on Q3 is in
+*              WORK(2*N+N*NR+1:2*N+N*NR+N)
+            END IF
+*
+         END IF
+*
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL)
+            DO 4968 q = 2, NR
+               CTEMP = XSC * V(q,q)
+               DO 4969 p = 1, q - 1
+*                 V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) )
+                  V(p,q) = - CTEMP
+ 4969          CONTINUE
+ 4968       CONTINUE
+         ELSE
+            CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+         END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower triangular.
+*
+*        Recover the right singular vectors as the solution of a
+*        well-conditioned triangular matrix equation.
+*
+         IF ( CONDR1 .LT. COND_OK ) THEN
+*
+            CALL CGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU,
+     $           CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK,
+     $           LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 3970 p = 1, NR
+               CALL CCOPY(  NR, V(1,p), 1, U(1,p), 1 )
+               CALL CSSCAL( NR, SVA(p),    V(1,p), 1 )
+ 3970       CONTINUE
+
+*           .. pick the right matrix equation and solve it
+*
+            IF ( NR .EQ. N ) THEN
+* :))          .. best case, R1 is inverted. The solution of this matrix
+*              equation is Q2*V2 = the product of the Jacobi rotations
+*              used in CGESVJ, premultiplied with the orthogonal matrix
+*              from the second QR factorization.
+               CALL CTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV)
+            ELSE
+*              .. R1 is well conditioned, but non-square. The adjoint of
+*              R2 is inverted to get the product of the Jacobi rotations
+*              used in CGESVJ. The Q-factor from the second QR
+*              factorization is then built in explicitly.
+               CALL CTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1),
+     $              N,V,LDV)
+               IF ( NR .LT. N ) THEN
+                  CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL CUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+            END IF
+*
+         ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+*           The matrix R2 is inverted. The solution of the matrix
+*           equation is Q3^* * V3 = the product of the Jacobi rotations
+*           (applied to the lower triangular L3 from the LQ
+*           factorization of R2=L3*Q3), pre-multiplied with the
+*           transposed Q3.
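+*           (Workspace map for the branches below: CWORK(N+1:2*N)
+*           holds the tau factors of the second QRF,
+*           CWORK(2*N+1:2*N+N*NR) a saved copy of the second factor,
+*           CWORK(2*N+N*NR+1:2*N+N*NR+NR) the tau factors of the LQ
+*           factorization R2 = L3*Q3 when it is computed, and
+*           everything from CWORK(2*N+N*NR+NR+1) on is passed to
+*           CGESVJ, CPOCON, CUNMQR and CUNMLQ as scratch space.)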
+            CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $           LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $           RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 3870 p = 1, NR
+               CALL CCOPY( NR, V(1,p), 1, U(1,p), 1 )
+               CALL CSSCAL( NR, SVA(p), U(1,p), 1 )
+ 3870       CONTINUE
+            CALL CTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N,
+     $           U,LDU)
+*           .. apply the permutation from the second QR factorization
+            DO 873 q = 1, NR
+               DO 872 p = 1, NR
+                  CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872           CONTINUE
+               DO 874 p = 1, NR
+                  U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 874           CONTINUE
+ 873        CONTINUE
+            IF ( NR .LT. N ) THEN
+               CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+               CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+               CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+            END IF
+            CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $           V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+         ELSE
+*           Last line of defense.
+* #:(       This is a rather pathological case: no scaled condition
+*           improvement after two pivoted QR factorizations. Another
+*           possibility is that the rank-revealing QR factorization
+*           or the condition estimator has failed, or COND_OK
+*           is set very close to ONE (which is unnecessary). Normally,
+*           this branch should never be executed, but in rare cases of
+*           failure of the RRQR or condition estimator, the last line of
+*           defense ensures that CGEJSV completes the task.
+*           Compute the full SVD of L3 using CGESVJ with explicit
+*           accumulation of Jacobi rotations.
+            CALL CGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $           LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $           RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            IF ( NR .LT. N ) THEN
+               CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+               CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+               CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+            END IF
+            CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $           V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+            CALL CUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N,
+     $           CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1),
+     $           LWORK-2*N-N*NR-NR, IERR )
+            DO 773 q = 1, NR
+               DO 772 p = 1, NR
+                  CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772           CONTINUE
+               DO 774 p = 1, NR
+                  U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 774           CONTINUE
+ 773        CONTINUE
+*
+         END IF
+*
+*        Permute the rows of V using the (column) permutation from the
+*        first QRF. Also, scale the columns to make them unit in
+*        Euclidean norm. This applies to all cases.
+*
+         TEMP1 = SQRT(REAL(N)) * EPSLN
+         DO 1972 q = 1, N
+            DO 972 p = 1, N
+               CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 972        CONTINUE
+            DO 973 p = 1, N
+               V(p,q) = CWORK(2*N+N*NR+NR+p)
+ 973        CONTINUE
+            XSC = ONE / SCNRM2( N, V(1,q), 1 )
+            IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL CSSCAL( N, XSC, V(1,q), 1 )
+ 1972    CONTINUE
+*        At this moment, V contains the right singular vectors of A.
+*        Next, assemble the left singular vector matrix U (M x N).
+         IF ( NR .LT. M ) THEN
+            CALL CLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+            IF ( NR .LT. N1 ) THEN
+               CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+               CALL CLASET('A',M-NR,N1-NR,CZERO,CONE,
+     $              U(NR+1,NR+1),LDU)
+            END IF
+         END IF
+*
+*        The Q matrix from the first QRF is built into the left singular
+*        matrix U. This applies to all cases.
+*
+         CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $        LDU, CWORK(N+1), LWORK-N, IERR )
+
+*        The columns of U are normalized. The cost is O(M*N) flops.
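+*        A column is rescaled below only if the reciprocal of its norm
+*        deviates from one by more than SQRT(M)*EPSLN; columns that are
+*        already of unit length to working precision are left untouched.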
+ TEMP1 = SQRT(REAL(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 5970 p = 2, N + CTEMP = XSC * CWORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 +* CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) / +* $ ABS(CWORK(N+(p-1)*N+q)) ) + CWORK(N+(q-1)*N+p)=-CTEMP + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL CLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N ) + END IF +* + CALL CGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA, + $ N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK, + $ INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 6970 p = 1, N + CALL CCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL CSSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL CTRSM( 'L', 'U', 'N', 'N', N, N, + $ CONE, A, LDA, CWORK(N+1), N ) + DO 6972 p = 1, N + CALL CCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = SQRT(REAL(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / SCNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL CLASET( 'A', M-N, N, CZERO, CZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL CLASET('A',N, N1-N, CZERO, CZERO, U(1,N+1),LDU) + CALL CLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU) + END IF + END IF + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + TEMP1 = SQRT(REAL(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values, e.g. when the singular values spread from +* the underflow to the overflow threshold. +* + DO 7968 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. 
q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF + + CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL CLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL CCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + CALL CLACGV( NR-p+1, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + CTEMP = CMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))), + $ ZERO) +* U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) ) + U(p,q) = - CTEMP + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) + END IF + + CALL CGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV ) + END IF + + CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(REAL(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / SCNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL CLASET( 'A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU) + CALL CLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU) + END IF + END IF +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^* + DO 6974 p = 1, N + CALL CSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL SLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + RWORK(1) = USCAL2 * SCALEM + RWORK(2) = USCAL1 + IF ( ERREST ) RWORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = CONDR1 + RWORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ENTRA + RWORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING + IF ( TRANSP ) THEN + IWORK(4) = 1 + ELSE + IWORK(4) = -1 + END IF + +* + RETURN +* .. +* .. END OF CGEJSV +* .. 
+ END +* diff --git a/lapack-netlib/cgesvx.f b/lapack-netlib/cgesvx.f new file mode 100644 index 000000000..74a37e9a0 --- /dev/null +++ b/lapack-netlib/cgesvx.f @@ -0,0 +1,602 @@ +*> \brief CGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* REAL BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGESVX uses the LU factorization to compute the solution to a complex +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. 
+*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is COMPLEX array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by CGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by CGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). 
+*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(1,2*N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. 
If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0 < INFO <= N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexGEsolve
+*
+*  =====================================================================
+      SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX            A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               CLANGE, CLANTR, SLAMCH
+      EXTERNAL           LSAME, CLANGE, CLANTR, SLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGECON, CGEEQU, CGERFS, CGETRF, CGETRS, CLACPY,
+     $                   CLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR.
COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL CGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL CLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL CLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL CGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = CLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGE( 'M', N, INFO, A, LDA, RWORK ) / + $ RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = CLANGE( NORM, N, N, A, LDA, RWORK ) + RPVGRW = CLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL CGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL CGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL CGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. 
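+*    (For TRANS = 'N' the equilibrated system was solved for
+*    inv(diag(C))*X, so below each row of X is multiplied by C(I) and
+*    the forward error bounds FERR are enlarged by 1/COLCND; the
+*    TRANS = 'T' or 'C' case uses R and ROWCND analogously.)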
+* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of CGESVX +* + END diff --git a/lapack-netlib/dgbsvx.f b/lapack-netlib/dgbsvx.f new file mode 100644 index 000000000..0ee5eecb3 --- /dev/null +++ b/lapack-netlib/dgbsvx.f @@ -0,0 +1,639 @@ +*> \brief DGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* DOUBLE PRECISION AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGBSVX uses the LU factorization to compute the solution to a real +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. 
+*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is DOUBLE PRECISION array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by DGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. 
+*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by DGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). 
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*> RCOND is DOUBLE PRECISION
+*> The estimate of the reciprocal condition number of the matrix
+*> A after equilibration (if done). If RCOND is less than the
+*> machine precision (in particular, if RCOND = 0), the matrix
+*> is singular to working precision. This condition is
+*> indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*> FERR is DOUBLE PRECISION array, dimension (NRHS)
+*> The estimated forward error bound for each solution vector
+*> X(j) (the j-th column of the solution matrix X).
+*> If XTRUE is the true solution corresponding to X(j), FERR(j)
+*> is an estimated upper bound for the magnitude of the largest
+*> element in (X(j) - XTRUE) divided by the magnitude of the
+*> largest element in X(j). The estimate is as reliable as
+*> the estimate for RCOND, and is almost always a slight
+*> overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*> BERR is DOUBLE PRECISION array, dimension (NRHS)
+*> The componentwise relative backward error of each solution
+*> vector X(j) (i.e., the smallest relative change in
+*> any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension (MAX(1,3*N))
+*> On exit, WORK(1) contains the reciprocal pivot growth
+*> factor norm(A)/norm(U). The "max absolute element" norm is
+*> used. If WORK(1) is much less than 1, then the stability
+*> of the LU factorization of the (equilibrated) matrix A
+*> could be poor. This also means that the solution X, condition
+*> estimator RCOND, and forward error bound FERR could be
+*> unreliable. If factorization fails with 0 < INFO <= N, then
+*> WORK(1) contains the reciprocal pivot growth factor for the
+*> leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0: successful exit
+*> < 0: if INFO = -i, the i-th argument had an illegal value
+*> > 0: if INFO = i, and i is
+*> <= N: U(i,i) is exactly zero. The factorization
+*> has been completed, but the factor U is exactly
+*> singular, so the solution and error bounds
+*> could not be computed. RCOND = 0 is returned.
+*> = N+1: U is nonsingular, but RCOND is less than machine
+*> precision, meaning that the matrix is singular
+*> to working precision. Nevertheless, the
+*> solution and error bounds are computed because
+*> there are a number of situations where the
+*> computed solution can be more accurate than the
+*> value of RCOND would suggest.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGBsolve
+*
+* =====================================================================
+ SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+ $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+ $ RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+* -- LAPACK driver routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ CHARACTER EQUED, FACT, TRANS
+ INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+ DOUBLE PRECISION RCOND
+* ..
+* .. Array Arguments .. 
+ INTEGER IPIV( * ), IWORK( * ) + DOUBLE PRECISION AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, DLANGB, DLANTB + EXTERNAL LSAME, DLAMCH, DLANGB, DLANTB +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGBCON, DGBEQU, DGBRFS, DGBTRF, DGBTRS, + $ DLACPY, DLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL DGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL DLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. 
EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL DCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL DGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = DLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = DLANGB( NORM, N, KL, KU, AB, LDAB, WORK ) + RPVGRW = DLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL DGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL DGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL DGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of DGBSVX +* + END diff --git a/lapack-netlib/dgejsv.f b/lapack-netlib/dgejsv.f new file mode 100644 index 000000000..ee769bb38 --- /dev/null +++ b/lapack-netlib/dgejsv.f @@ -0,0 +1,1780 @@ +*> \brief \b DGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* WORK, LWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ), +* $ WORK( LWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. 
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DGEJSV computes the singular value decomposition (SVD) of a real M-by-N
+*> matrix [A], where M >= N. The SVD of [A] is written as
+*>
+*> [A] = [U] * [SIGMA] * [V]^t,
+*>
+*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+*> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and
+*> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are
+*> the singular values of [A]. The columns of [U] and [V] are the left and
+*> the right singular vectors of [A], respectively. The matrices [U] and [V]
+*> are computed and stored in the arrays U and V, respectively. The diagonal
+*> of [SIGMA] is computed and stored in the array SVA.
+*> DGEJSV can sometimes compute tiny singular values and their singular vectors much
+*> more accurately than other SVD routines; see below under Further Details.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] JOBA
+*> \verbatim
+*> JOBA is CHARACTER*1
+*> Specifies the level of accuracy:
+*> = 'C': This option works well (high relative accuracy) if A = B * D,
+*> with well-conditioned B and arbitrary diagonal matrix D.
+*> The accuracy cannot be spoiled by COLUMN scaling. The
+*> accuracy of the computed output depends on the condition of
+*> B, and the procedure aims at the best theoretical accuracy.
+*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is
+*> bounded by f(M,N)*epsilon* cond(B), independent of D.
+*> The input matrix is preprocessed with the QRF with column
+*> pivoting. This initial preprocessing and preconditioning by
+*> a rank revealing QR factorization is common for all values of
+*> JOBA. Additional actions are specified as follows:
+*> = 'E': Computation as with 'C' with an additional estimate of the
+*> condition number of B. It provides a realistic error bound.
+*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*> D1, D2, and well-conditioned matrix C, this option gives
+*> higher accuracy than the 'C' option. If the structure of the
+*> input matrix is not known, and relative accuracy is
+*> desirable, then this option is advisable. The input matrix A
+*> is preprocessed with QR factorization with FULL (row and
+*> column) pivoting.
+*> = 'G': Computation as with 'F' with an additional estimate of the
+*> condition number of B, where A=D*B. If A has heavily weighted
+*> rows, then using this condition number gives a too pessimistic
+*> error bound.
+*> = 'A': Small singular values are the noise and the matrix is treated
+*> as numerically rank deficient. The error in the computed
+*> singular values is bounded by f(m,n)*epsilon*||A||.
+*> The computed SVD A = U * S * V^t restores A up to
+*> f(m,n)*epsilon*||A||.
+*> This gives the procedure the licence to discard (set to zero)
+*> all singular values below N*epsilon*||A||.
+*> = 'R': Similar to 'A'. The rank revealing property of the initial
+*> QR factorization is used to reveal (using the triangular
+*> factor) a gap sigma_{r+1} < epsilon * sigma_r, in which case
+*> the numerical RANK is declared to be r. The SVD is computed
+*> with absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*> JOBU is CHARACTER*1
+*> Specifies whether to compute the columns of U:
+*> = 'U': N columns of U are returned in the array U.
+*> = 'F': full set of M left sing. vectors is returned in the array U.
+*> = 'W': U may be used as workspace of length M*N. See the description
+*> of U. 
+*> = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*> JOBV is CHARACTER*1
+*> Specifies whether to compute the matrix V:
+*> = 'V': N columns of V are returned in the array V; Jacobi rotations
+*> are not explicitly accumulated.
+*> = 'J': N columns of V are returned in the array V, but they are
+*> computed as the product of Jacobi rotations. This option is
+*> allowed only if JOBU .NE. 'N', i.e. in computing the full SVD.
+*> = 'W': V may be used as workspace of length N*N. See the description
+*> of V.
+*> = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*> JOBR is CHARACTER*1
+*> Specifies the RANGE for the singular values. Issues the licence to
+*> set to zero small positive singular values if they are outside the
+*> specified range. If A .NE. 0 is scaled so that the largest singular
+*> value of c*A is around DSQRT(BIG), BIG=DLAMCH('O'), then JOBR issues
+*> the licence to kill columns of A whose norm in c*A is less than
+*> DSQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*> where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E').
+*> = 'N': Do not kill small columns of c*A. This option assumes that
+*> BLAS and QR factorizations and triangular solvers are
+*> implemented to work in that range. If the condition of A
+*> is greater than BIG, use DGESVJ.
+*> = 'R': RESTRICTED range for sigma(c*A) is [DSQRT(SFMIN), DSQRT(BIG)]
+*> (roughly, as described above). This option is recommended.
+*> ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*> For computing the singular values in the FULL range [SFMIN,BIG]
+*> use DGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*> JOBT is CHARACTER*1
+*> If the matrix is square then the procedure may determine to use
+*> transposed A if A^t seems to be better with respect to convergence.
+*> If the matrix is not square, JOBT is ignored. This is subject to
+*> changes in the future.
+*> The decision is based on two values of entropy over the adjoint
+*> orbit of A^t * A. See the descriptions of WORK(6) and WORK(7).
+*> = 'T': transpose if entropy test indicates possibly faster
+*> convergence of Jacobi process if A^t is taken as input. If A is
+*> replaced with A^t, then the row pivoting is included automatically.
+*> = 'N': do not speculate.
+*> This option can be used to compute only the singular values, or the
+*> full SVD (U, SIGMA and V). For only one set of singular vectors
+*> (U or V), the caller should provide both U and V, as one of the
+*> matrices is used as workspace if the matrix A is transposed.
+*> The implementer can easily remove this constraint and make the
+*> code more complicated. See the descriptions of U and V.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*> JOBP is CHARACTER*1
+*> Issues the licence to introduce structured perturbations to drown
+*> denormalized numbers. This licence should be active if the
+*> denormals are poorly implemented, causing slow computation,
+*> especially in cases of fast convergence (!). For details see [1,2].
+*> For the sake of simplicity, these perturbations are included only
+*> when the full SVD or only the singular values are requested. The
+*> implementer/user can easily add the perturbation for the cases of
+*> computing one set of singular vectors.
+*> = 'P': introduce perturbation
+*> = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the input matrix A. M >= 0. 
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension (LDA,N)
+*> On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*> SVA is DOUBLE PRECISION array, dimension (N)
+*> On exit,
+*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the
+*> computation SVA contains Euclidean column norms of the
+*> iterated matrices in the array A.
+*> - For WORK(1) .NE. WORK(2): The singular values of A are
+*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if
+*> sigma_max(A) overflows or if small singular values have been
+*> saved from underflow by scaling the input matrix A.
+*> - If JOBR='R' then some of the singular values may be returned
+*> as exact zeros obtained by "set to zero" because they are
+*> below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*> U is DOUBLE PRECISION array, dimension ( LDU, N ) or ( LDU, M )
+*> If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*> the left singular vectors.
+*> If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*> the left singular vectors, including an ONB
+*> of the orthogonal complement of the Range(A).
+*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*> then U is used as workspace if the procedure
+*> replaces A with A^t. In that case, [V] is computed
+*> in U as left singular vectors of A^t and then
+*> copied back to the V array. This 'W' option is just
+*> a reminder to the caller that in this case U is
+*> reserved as workspace of length N*N.
+*> If JOBU = 'N' U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*> LDU is INTEGER
+*> The leading dimension of the array U, LDU >= 1.
+*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*> V is DOUBLE PRECISION array, dimension ( LDV, N )
+*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*> the right singular vectors;
+*> If JOBV = 'W' .AND. (JOBU = 'U' .AND. JOBT = 'T' .AND. M = N),
+*> then V is used as workspace if the procedure
+*> replaces A with A^t. In that case, [U] is computed
+*> in V as right singular vectors of A^t and then
+*> copied back to the U array. This 'W' option is just
+*> a reminder to the caller that in this case V is
+*> reserved as workspace of length N*N.
+*> If JOBV = 'N' V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V, LDV >= 1.
+*> If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is DOUBLE PRECISION array, dimension (MAX(7,LWORK))
+*> On exit, if N > 0 .AND. M > 0 (else not referenced),
+*> WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such
+*> that SCALE*SVA(1:N) are the computed singular values
+*> of A. (See the description of SVA().)
+*> WORK(2) = See the description of WORK(1).
+*> WORK(3) = SCONDA is an estimate for the condition number of
+*> column equilibrated A. (If JOBA = 'E' or 'G')
+*> SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
+*> It is computed using DPOCON. 
It holds
+*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*> where R is the triangular factor from the QRF of A.
+*> However, if R is truncated and the numerical rank is
+*> determined to be strictly smaller than N, SCONDA is
+*> returned as -1, thus indicating that the smallest
+*> singular values might be lost.
+*>
+*> If full SVD is needed, the following two condition numbers are
+*> useful for the analysis of the algorithm. They are provided for
+*> a developer/implementer who is familiar with the details of
+*> the method.
+*>
+*> WORK(4) = an estimate of the scaled condition number of the
+*> triangular factor in the first QR factorization.
+*> WORK(5) = an estimate of the scaled condition number of the
+*> triangular factor in the second QR factorization.
+*> The following two parameters are computed if JOBT = 'T'.
+*> They are provided for a developer/implementer who is familiar
+*> with the details of the method.
+*>
+*> WORK(6) = the entropy of A^t*A :: this is the Shannon entropy
+*> of diag(A^t*A) / Trace(A^t*A) taken as a point in the
+*> probability simplex.
+*> WORK(7) = the entropy of A*A^t.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> Length of WORK to confirm proper allocation of work space.
+*> LWORK depends on the job:
+*>
+*> If only SIGMA is needed (JOBU = 'N', JOBV = 'N') and
+*> -> .. no scaled condition estimate required (JOBA .NE. 'E'
+*> and JOBA .NE. 'G'):
+*> LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement.
+*> ->> For optimal performance (blocked code) the optimal value
+*> is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal
+*> block size for DGEQP3 and DGEQRF.
+*> In general, optimal LWORK is computed as
+*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), 7).
+*> -> .. an estimate of the scaled condition number of A is
+*> required (JOBA='E', 'G'). In this case, LWORK is the maximum
+*> of the above and N*N+4*N, i.e. LWORK >= max(2*M+N,N*N+4*N,7).
+*> ->> For optimal performance (blocked code) the optimal value
+*> is LWORK >= max(2*M+N,3*N+(N+1)*NB, N*N+4*N, 7).
+*> In general, the optimal length LWORK is computed as
+*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF),
+*> N+N*N+LWORK(DPOCON),7).
+*>
+*> If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
+*> -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*> where NB is the optimal block size for DGEQP3, DGEQRF, DGELQF,
+*> DORMLQ. In general, the optimal length LWORK is computed as
+*> LWORK >= max(2*M+N,N+LWORK(DGEQP3), N+LWORK(DPOCON),
+*> N+LWORK(DGELQF), 2*N+LWORK(DGEQRF), N+LWORK(DORMLQ)).
+*>
+*> If SIGMA and the left singular vectors are needed
+*> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
+*> -> For optimal performance:
+*> if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*> if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7),
+*> where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR.
+*> In general, the optimal length LWORK is computed as
+*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON),
+*> 2*N+LWORK(DGEQRF), N+LWORK(DORMQR)).
+*> Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or
+*> M*NB (for JOBU = 'F').
+*>
+*> If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*> -> if JOBV = 'V'
+*> the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N).
+*> -> if JOBV = 'J' the minimal requirement is
+*> LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6). 
+*> -> For optimal performance, LWORK should be additionally +*> larger than N+M*NB, where NB is the optimal block size +*> for DORMQR. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(3,M+3*N)). +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: DGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> DGEJSV implements a preconditioned Jacobi SVD algorithm. It uses DGEQP3, +*> DGEQRF, and DGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by DGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (DGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (DGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: DGEQP3) should be +*> implemented as in [3]. We have a new version of DGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. 
That +*> well known trick is not used in DGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of DGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributors: +* ================== +*> +*> Zlatko Drmac (Zagreb, Croatia) and Kresimir Veselic (Hagen, Germany) +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ WORK, LWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ), + $ WORK( LWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK, + $ CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, MAXPRJ, SCALEM, + $ SCONDA, SFMIN, SMALL, TEMP1, USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LSVEC, + $ L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, + $ NOSCAL, ROWPIV, RSVEC, TRANSP +* .. +* .. Intrinsic Functions .. + INTRINSIC DABS, DLOG, MAX, MIN, DBLE, IDNINT, DSIGN, DSQRT +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DNRM2 + INTEGER IDAMAX + LOGICAL LSAME + EXTERNAL IDAMAX, LSAME, DLAMCH, DNRM2 +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGELQF, DGEQP3, DGEQRF, DLACPY, DLASCL, + $ DLASET, DLASSQ, DLASWP, DORGQR, DORMLQ, + $ DORMQR, DPOCON, DSCAL, DSWAP, DTRSM, XERBLA +* + EXTERNAL DGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. 
LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ LSAME( JOBU, 'W' )) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ LSAME( JOBV, 'W' )) .OR. ( JRACC .AND. (.NOT.LSVEC) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( L2TRAN .OR. LSAME( JOBT, 'N' ) ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE IF ( (.NOT.(LSVEC .OR. RSVEC .OR. ERREST).AND. + & (LWORK .LT. MAX(7,4*N+1,2*M+N))) .OR. + & (.NOT.(LSVEC .OR. RSVEC) .AND. ERREST .AND. + & (LWORK .LT. MAX(7,4*N+N*N,2*M+N))) .OR. + & (LSVEC .AND. (.NOT.RSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1))) + & .OR. + & (RSVEC .AND. (.NOT.LSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1))) + & .OR. + & (LSVEC .AND. RSVEC .AND. (.NOT.JRACC) .AND. + & (LWORK.LT.MAX(2*M+N,6*N+2*N*N))) + & .OR. (LSVEC .AND. RSVEC .AND. JRACC .AND. + & LWORK.LT.MAX(2*M+N,4*N+N*N,2*N+N*N+6))) + & THEN + INFO = - 17 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'DGEJSV', - INFO ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:3) = 0 + WORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure DLAMCH() does not fail on the target architecture. +* + EPSLN = DLAMCH('Epsilon') + SFMIN = DLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = DLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / DSQRT(DBLE(M)*DBLE(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL DLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'DGEJSV', -INFO ) + RETURN + END IF + AAQQ = DSQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL DSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL DLASET( 'G', M, N1, ZERO, ONE, U, LDU ) + IF ( RSVEC ) CALL DLASET( 'G', N, N, ZERO, ONE, V, LDV ) + WORK(1) = ONE + WORK(2) = ONE + IF ( ERREST ) WORK(3) = ONE + IF ( LSVEC .AND. 
RSVEC ) THEN + WORK(4) = ONE + WORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + WORK(6) = ZERO + WORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL DLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL DLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL DGEQRF( M, N, U,LDU, WORK, WORK(N+1),LWORK-N,IERR ) + CALL DORGQR( M,N1,1, U,LDU,WORK,WORK(N+1),LWORK-N,IERR ) + CALL DCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = ONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + WORK(1) = ONE / SCALEM + WORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IF ( ERREST ) WORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = ONE + WORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + WORK(6) = ZERO + WORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. + L2TRAN = L2TRAN .AND. ( M .EQ. N ) +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^t and A^t * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* DLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + WORK(M+N+p) = XSC * SCALEM + WORK(N+p) = XSC * (SCALEM*DSQRT(TEMP1)) + AATMAX = MAX( AATMAX, WORK(N+p) ) + IF (WORK(N+p) .NE. ZERO) AATMIN = MIN(AATMIN,WORK(N+p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + WORK(M+N+p) = SCALEM*DABS( A(p,IDAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, WORK(M+N+p) ) + AATMIN = MIN( AATMIN, WORK(M+N+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^t would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / DLOG(DBLE(N)) +* +* Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex. +* It is derived from the diagonal of A^t * A. Do the same with the +* diagonal of A * A^t, compute the entropy of the corresponding +* probability distribution. Note that A * A^t and A^t * A have the +* same trace. 
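+* (As in the column loop above, the entropy is computed as
+* -SUM p_i * DLOG(p_i) / DLOG(n), with each p_i a squared norm
+* divided by the trace; the DLOG(n) normalization, n = N for the
+* columns and n = M for the rows, maps both entropies into [0,1]
+* so that the two values can be compared directly.)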
+* + ENTRAT = ZERO + DO 1114 p = N+1, N+M + BIG1 = ( ( WORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / DLOG(DBLE(M)) +* +* Analyze the entropies and decide A or A^t. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^t is better than A, transpose A. +* + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + DO 1116 q = p + 1, N + TEMP1 = A(q,p) + A(q,p) = A(p,q) + A(p,q) = TEMP1 + 1116 CONTINUE + 1115 CONTINUE + DO 1117 p = 1, N + WORK(M+N+p) = SVA(p) + SVA(p) = WORK(N+p) + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than DSQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep +* DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then DGESVJ will compute them. So, in that case, +* one should use DGESVJ instead of DGEJSV. +* + BIG1 = DSQRT( BIG ) + TEMP1 = DSQRT( BIG / DBLE(N) ) +* + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN). + XSC = DSQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. DSQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using DGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.DSQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL DLASET( 'A', M, 1, ZERO, ZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + DO 1952 p = 1, M - 1 + q = IDAMAX( M-p+1, WORK(M+N+p), 1 ) + p - 1 + IWORK(2*N+p) = q + IF ( p .NE. 
q ) THEN + TEMP1 = WORK(M+N+p) + WORK(M+N+p) = WORK(M+N+q) + WORK(M+N+q) = TEMP1 + END IF + 1952 CONTINUE + CALL DLASWP( N, A, LDA, 1, M-1, IWORK(2*N+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in SGEQPX from TOMS # 782). Good results will be obtained using +* SGEQPX with properly (!) chosen numerical parameters. +* Any improvement of DGEQP3 improves overall performance of DGEJSV. +* +* A * P1 = Q1 * [ R1^t 0]^t: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL DGEQP3( M,N,A,LDA, IWORK,WORK, WORK(N+1),LWORK-N, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. If +* L2RANK or L2ABER are up, then DGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = DSQRT(DBLE(N))*EPSLN + DO 3001 p = 2, N + IF ( DABS(A(p,p)) .GE. (TEMP1*DABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = DSQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( DABS(A(p,p)) .LT. (EPSLN*DABS(A(p-1,p-1))) ) .OR. + $ ( DABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = DSQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( DABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = DABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. 
V is available as workspace + CALL DLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + CALL DPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ WORK(N+1), IWORK(2*N+M+1), IERR ) + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL DLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL DPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ WORK(N+1), IWORK(2*N+M+1), IERR ) + ELSE + CALL DLACPY( 'U', N, N, A, LDA, WORK(N+1), N ) + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, WORK(N+(p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. + CALL DPOCON( 'U', N, WORK(N+1), N, ONE, TEMP1, + $ WORK(N+N*N+1), IWORK(2*N+M+1), IERR ) + END IF + SCONDA = ONE / DSQRT(TEMP1) +* SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( DABS( A(1,1)/A(NR,NR) ) .GT. DSQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL DCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + 1946 CONTINUE +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = DSQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 4947 q = 1, NR + TEMP1 = XSC*DABS(A(q,q)) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) + $ A(p,q) = DSIGN( TEMP1, A(p,q) ) + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL DGEQRF( N,NR, A,LDA, WORK, WORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL DCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = DSQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 1947 q = 1, NR + TEMP1 = XSC*DABS(A(q,q)) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) + $ A(p,q) = DSIGN( TEMP1, A(p,q) ) + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, A(1,2), LDA ) + END IF +* +* .. 
and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL DGESVJ( 'L', 'NoU', 'NoV', NR, NR, A, LDA, SVA, + $ N, V, LDV, WORK, LWORK, INFO ) +* + SCALEM = WORK(1) + NUMRANK = IDNINT(WORK(2)) +* +* + ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 1998 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) +* + CALL DGESVJ( 'L','U','N', N, NR, V,LDV, SVA, NR, A,LDA, + $ WORK, LWORK, INFO ) + SCALEM = WORK(1) + NUMRANK = IDNINT(WORK(2)) + + ELSE +* +* .. two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL DLASET( 'Lower', NR-1, NR-1, ZERO, ZERO, A(2,1), LDA ) + CALL DGELQF( NR, N, A, LDA, WORK, WORK(N+1), LWORK-N, IERR) + CALL DLACPY( 'Lower', NR, NR, A, LDA, V, LDV ) + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + CALL DGEQRF( NR, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL DCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + 8998 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) +* + CALL DGESVJ( 'Lower', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, WORK(N+1), LWORK, INFO ) + SCALEM = WORK(N+1) + NUMRANK = IDNINT(WORK(N+2)) + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR, NR, ZERO,ZERO, V(NR+1,1), LDV ) + CALL DLASET( 'A',NR, N-NR, ZERO,ZERO, V(1,NR+1), LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE, V(NR+1,NR+1), LDV ) + END IF +* + CALL DORMLQ( 'Left', 'Transpose', N, N, NR, A, LDA, WORK, + $ V, LDV, WORK(N+1), LWORK-N, IERR ) +* + END IF +* + DO 8991 p = 1, N + CALL DCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) + 8991 CONTINUE + CALL DLACPY( 'All', N, N, A, LDA, V, LDV ) +* + IF ( TRANSP ) THEN + CALL DLACPY( 'All', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + 1965 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) +* + CALL DGEQRF( N, NR, U, LDU, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL DCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + 1967 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) +* + CALL DGESVJ( 'Lower', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, WORK(N+1), LWORK-N, INFO ) + SCALEM = WORK(N+1) + NUMRANK = IDNINT(WORK(N+2)) +* + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR,ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1), LDU ) + CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1), LDU ) + END IF + END IF +* + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / DNRM2( M, U(1,p), 1 ) + CALL DSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL DLACPY( 'All', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. 
ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of DGEJSV. +* + DO 1968 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 2969 q = 1, NR + TEMP1 = XSC*DABS( V(q,q) ) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) + $ V(p,q) = DSIGN( TEMP1, V(p,q) ) + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL DLACPY( 'L', NR, NR, V, LDV, WORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = DNRM2(NR-p+1,WORK(2*N+(p-1)*NR+p),1) + CALL DSCAL(NR-p+1,ONE/TEMP1,WORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL DPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1, + $ WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR) + CONDR1 = ONE / DSQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) +* more conservative <=> CONDR1 .LT. DSQRT(DBLE(N)) +* + COND_OK = DSQRT(DBLE(NR)) +*[TP] COND_OK is a tuning parameter. + + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^t = Q2 * R2 + CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q))) + IF ( DABS(V(q,p)) .LE. TEMP1 ) + $ V(q,p) = DSIGN( TEMP1, V(q,p) ) + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL DCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + 1969 CONTINUE +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to DGEQP3 +* should be replaced with eg. CALL SGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. 
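+*           (Column pivoting makes this second QRF rank revealing: it
+*           tends to push the ill-conditioning of R1 into the trailing
+*           diagonal entries of R2, so that the leading triangle of R2
+*           stays comparatively well conditioned.)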
+*
+*           R1^t * P2 = Q2 * R2
+            DO 3003 p = 1, NR
+               IWORK(N+p) = 0
+ 3003       CONTINUE
+            CALL DGEQP3( N, NR, V, LDV, IWORK(N+1), WORK(N+1),
+     $                   WORK(2*N+1), LWORK-2*N, IERR )
+**               CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+**     $              LWORK-2*N, IERR )
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 3969 p = 2, NR
+                  DO 3968 q = 1, p - 1
+                     TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                     IF ( DABS(V(q,p)) .LE. TEMP1 )
+     $                  V(q,p) = DSIGN( TEMP1, V(q,p) )
+ 3968             CONTINUE
+ 3969          CONTINUE
+            END IF
+*
+            CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N )
+*
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 8970 p = 2, NR
+                  DO 8971 q = 1, p - 1
+                     TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                     V(p,q) = - DSIGN( TEMP1, V(q,p) )
+ 8971             CONTINUE
+ 8970          CONTINUE
+            ELSE
+               CALL DLASET( 'L',NR-1,NR-1,ZERO,ZERO,V(2,1),LDV )
+            END IF
+*           Now, compute R2 = L3 * Q3, the LQ factorization.
+            CALL DGELQF( NR, NR, V, LDV, WORK(2*N+N*NR+1),
+     $           WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*           .. and estimate the condition number
+            CALL DLACPY( 'L',NR,NR,V,LDV,WORK(2*N+N*NR+NR+1),NR )
+            DO 4950 p = 1, NR
+               TEMP1 = DNRM2( p, WORK(2*N+N*NR+NR+p), NR )
+               CALL DSCAL( p, ONE/TEMP1, WORK(2*N+N*NR+NR+p), NR )
+ 4950       CONTINUE
+            CALL DPOCON( 'L',NR,WORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $           WORK(2*N+N*NR+NR+NR*NR+1),IWORK(M+2*N+1),IERR )
+            CONDR2 = ONE / DSQRT(TEMP1)
+*
+            IF ( CONDR2 .GE. COND_OK ) THEN
+*              .. save the Householder vectors used for Q3
+*              (this overwrites the copy of R2, as it will not be
+*              needed in this branch, but it does not overwrite the
+*              Householder vectors of Q2).
+               CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
+*              .. and the rest of the information on Q3 is in
+*              WORK(2*N+N*NR+1:2*N+N*NR+N)
+            END IF
+*
+         END IF
+*
+         IF ( L2PERT ) THEN
+            XSC = DSQRT(SMALL)
+            DO 4968 q = 2, NR
+               TEMP1 = XSC * V(q,q)
+               DO 4969 p = 1, q - 1
+*                 V(p,q) = - DSIGN( TEMP1, V(q,p) )
+                  V(p,q) = - DSIGN( TEMP1, V(p,q) )
+ 4969          CONTINUE
+ 4968       CONTINUE
+         ELSE
+            CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+         END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower triangular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+         IF ( CONDR1 .LT. COND_OK ) THEN
+*
+            CALL DGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U,
+     $           LDU,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,INFO )
+            SCALEM = WORK(2*N+N*NR+NR+1)
+            NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+            DO 3970 p = 1, NR
+               CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
+               CALL DSCAL( NR, SVA(p), V(1,p), 1 )
+ 3970       CONTINUE
+
+*        .. pick the right matrix equation and solve it
+*
+            IF ( NR .EQ. N ) THEN
+* :))          .. best case, R1 is inverted. The solution of this matrix
+*              equation is Q2*V2 = the product of the Jacobi rotations
+*              used in DGESVJ, premultiplied with the orthogonal matrix
+*              from the second QR factorization.
+               CALL DTRSM( 'L','U','N','N', NR,NR,ONE, A,LDA, V,LDV )
+            ELSE
+*              .. R1 is well conditioned, but non-square. Transpose(R2)
+*              is inverted to get the product of the Jacobi rotations
+*              used in DGESVJ. The Q-factor from the second QR
+*              factorization is then built in explicitly.
+               CALL DTRSM('L','U','T','N',NR,NR,ONE,WORK(2*N+1),
+     $              N,V,LDV)
+               IF ( NR .LT. N ) THEN
+                 CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV)
+                 CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                 CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL DORMQR('L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $             V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+            END IF
+*
+         ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+* :)        ..
the input matrix A is very likely a relative of
+*           the Kahan matrix :)
+*           The matrix R2 is inverted. The solution of the matrix equation
+*           is Q3^T*V3 = the product of the Jacobi rotations (applied to
+*           the lower triangular L3 from the LQ factorization of
+*           R2=L3*Q3), pre-multiplied with the transposed Q3.
+            CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $           LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
+            SCALEM = WORK(2*N+N*NR+NR+1)
+            NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+            DO 3870 p = 1, NR
+               CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
+               CALL DSCAL( NR, SVA(p), U(1,p), 1 )
+ 3870       CONTINUE
+            CALL DTRSM('L','U','N','N',NR,NR,ONE,WORK(2*N+1),N,U,LDU)
+*           .. apply the permutation from the second QR factorization
+            DO 873 q = 1, NR
+               DO 872 p = 1, NR
+                  WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872           CONTINUE
+               DO 874 p = 1, NR
+                  U(p,q) = WORK(2*N+N*NR+NR+p)
+ 874           CONTINUE
+ 873        CONTINUE
+            IF ( NR .LT. N ) THEN
+               CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+               CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+               CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+            END IF
+            CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $           V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+         ELSE
+*           Last line of defense.
+* #:(       This is a rather pathological case: no scaled condition
+*           improvement after two pivoted QR factorizations. Another
+*           possibility is that the rank revealing QR factorization
+*           or the condition estimator has failed, or the COND_OK
+*           is set very close to ONE (which is unnecessary). Normally,
+*           this branch should never be executed, but in rare cases of
+*           failure of the RRQR or condition estimator, the last line of
+*           defense ensures that DGEJSV completes the task.
+*           Compute the full SVD of L3 using DGESVJ with explicit
+*           accumulation of Jacobi rotations.
+            CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $           LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
+            SCALEM = WORK(2*N+N*NR+NR+1)
+            NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+            IF ( NR .LT. N ) THEN
+               CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+               CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+               CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+            END IF
+            CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $           V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+            CALL DORMLQ( 'L', 'T', NR, NR, NR, WORK(2*N+1), N,
+     $           WORK(2*N+N*NR+1), U, LDU, WORK(2*N+N*NR+NR+1),
+     $           LWORK-2*N-N*NR-NR, IERR )
+            DO 773 q = 1, NR
+               DO 772 p = 1, NR
+                  WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772           CONTINUE
+               DO 774 p = 1, NR
+                  U(p,q) = WORK(2*N+N*NR+NR+p)
+ 774           CONTINUE
+ 773        CONTINUE
+*
+         END IF
+*
+*        Permute the rows of V using the (column) permutation from the
+*        first QRF. Also, scale the columns to make them unit in
+*        Euclidean norm. This applies to all cases.
+*
+         TEMP1 = DSQRT(DBLE(N)) * EPSLN
+         DO 1972 q = 1, N
+            DO 972 p = 1, N
+               WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 972        CONTINUE
+            DO 973 p = 1, N
+               V(p,q) = WORK(2*N+N*NR+NR+p)
+ 973        CONTINUE
+            XSC = ONE / DNRM2( N, V(1,q), 1 )
+            IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL DSCAL( N, XSC, V(1,q), 1 )
+ 1972    CONTINUE
+*        At this moment, V contains the right singular vectors of A.
+*        Next, assemble the left singular vector matrix U (M x N).
+         IF ( NR .LT. M ) THEN
+            CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU )
+            IF ( NR .LT.
N1 ) THEN + CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL DLASET('A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = DSQRT(DBLE(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / DNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL DLACPY( 'Upper', N, N, A, LDA, WORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 5970 p = 2, N + TEMP1 = XSC * WORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 + WORK(N+(q-1)*N+p)=-DSIGN(TEMP1,WORK(N+(p-1)*N+q)) + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL DLASET( 'Lower',N-1,N-1,ZERO,ZERO,WORK(N+2),N ) + END IF +* + CALL DGESVJ( 'Upper', 'U', 'N', N, N, WORK(N+1), N, SVA, + $ N, U, LDU, WORK(N+N*N+1), LWORK-N-N*N, INFO ) +* + SCALEM = WORK(N+N*N+1) + NUMRANK = IDNINT(WORK(N+N*N+2)) + DO 6970 p = 1, N + CALL DCOPY( N, WORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL DSCAL( N, SVA(p), WORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N, + $ ONE, A, LDA, WORK(N+1), N ) + DO 6972 p = 1, N + CALL DCOPY( N, WORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = DSQRT(DBLE(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / DNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL DLASET( 'A', M-N, N, ZERO, ZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL DLASET( 'A',N, N1-N, ZERO, ZERO, U(1,N+1),LDU ) + CALL DLASET( 'A',M-N,N1-N, ZERO, ONE,U(N+1,N+1),LDU ) + END IF + END IF + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + TEMP1 = DSQRT(DBLE(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / DNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values. Since that is not always the case, ... +* + DO 7968 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + TEMP1 = XSC*DABS( V(q,q) ) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 ) + $ .OR. 
( p .LT. q ) ) + $ V(p,q) = DSIGN( TEMP1, V(p,q) ) + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + END IF + + CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL DLACPY( 'L', N, NR, V, LDV, WORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + TEMP1 = XSC * MIN(DABS(U(p,p)),DABS(U(q,q))) + U(p,q) = - DSIGN( TEMP1, U(q,p) ) + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL DLASET('U', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) + END IF + + CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, WORK(2*N+N*NR+1), LWORK-2*N-N*NR, INFO ) + SCALEM = WORK(2*N+N*NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+2)) + + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV ) + CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV ) + END IF + + CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = DSQRT(DBLE(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = WORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / DNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1),LDU ) + CALL DLASET( 'A',M-NR,N1-NR, ZERO, ONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^t + DO 6974 p = 1, N + CALL DSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + WORK(1) = USCAL2 * SCALEM + WORK(2) = USCAL1 + IF ( ERREST ) WORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = CONDR1 + WORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + WORK(6) = ENTRA + WORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING +* + RETURN +* .. +* .. END OF DGEJSV +* .. 
+ END +* diff --git a/lapack-netlib/dgesvx.f b/lapack-netlib/dgesvx.f new file mode 100644 index 000000000..f787488dc --- /dev/null +++ b/lapack-netlib/dgesvx.f @@ -0,0 +1,599 @@ +*> \brief DGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* DOUBLE PRECISION A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGESVX uses the LU factorization to compute the solution to a real +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. 
+*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is DOUBLE PRECISION array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by DGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by DGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). 
+*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,4*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. 
If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGEsolve
+*
+*  =====================================================================
+      SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, DLANGE, DLANTR
+      EXTERNAL           LSAME, DLAMCH, DLANGE, DLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGECON, DGEEQU, DGERFS, DGETRF, DGETRS, DLACPY,
+     $                   DLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+ $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL DGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL DLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL DLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL DGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = DLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = DLANGE( NORM, N, N, A, LDA, WORK ) + RPVGRW = DLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL DGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL DGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL DGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. 
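+*     (If the system was equilibrated, the computed X solves the scaled
+*     system; for TRANS = 'N' the columns were scaled by C, so X is
+*     mapped back as X := diag(C)*X and the forward error bounds grow
+*     by 1/COLCND. The transposed cases use diag(R) and 1/ROWCND.)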
+* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* + WORK( 1 ) = RPVGRW +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 + RETURN +* +* End of DGESVX +* + END diff --git a/lapack-netlib/sgbsvx.f b/lapack-netlib/sgbsvx.f new file mode 100644 index 000000000..df3a721d9 --- /dev/null +++ b/lapack-netlib/sgbsvx.f @@ -0,0 +1,641 @@ +*> \brief SGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* REAL AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGBSVX uses the LU factorization to compute the solution to a real +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. 
Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is REAL array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by SGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). 
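+*>
+*>          For illustration (using the same example as the SGBTRF
+*>          documentation), with N = 6, KL = 2, KU = 1 the factored
+*>          form is stored in AFB as
+*>
+*>              *    *    *   u14  u25  u36
+*>              *    *   u13  u24  u35  u46
+*>              *   u12  u23  u34  u45  u56
+*>             u11  u22  u33  u44  u55  u66
+*>             m21  m32  m43  m54  m65   *
+*>             m31  m42  m53  m64   *    *
+*>
+*>          where u(i,j) denotes an entry of U, m(i,j) a multiplier,
+*>          and * an element that is not referenced.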
+*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by SGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. 
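+*>          (RCOND is an estimate of 1 / ( norm(A) * norm(inv(A)) ),
+*>          computed by SGBCON in the 1-norm if TRANS = 'N' and in the
+*>          infinity-norm otherwise.)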
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,3*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup realGBsolve
+*
+*  =====================================================================
+      SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      REAL               AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*  Moved setting of INFO = N+1 so INFO does not subsequently get
+*  overwritten.  Sven, 17 Mar 05.
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+ LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SLAMCH, SLANGB, SLANTB + EXTERNAL LSAME, SLAMCH, SLANGB, SLANTB +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SGBCON, SGBEQU, SGBRFS, SGBTRF, SGBTRS, + $ SLACPY, SLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL SGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL SLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL SCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL SGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. 
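+*           (The growth factor compares the largest entry of the first
+*           INFO columns of A, in absolute value, with the largest
+*           entry of the partially computed factor U; a ratio much
+*           smaller than one signals severe element growth during the
+*           elimination.)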
+* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = SLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = SLANGB( NORM, N, KL, KU, AB, LDAB, WORK ) + RPVGRW = SLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL SGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL SGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL SGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of SGBSVX +* + END diff --git a/lapack-netlib/sgesvx.f b/lapack-netlib/sgesvx.f new file mode 100644 index 000000000..385e626cf --- /dev/null +++ b/lapack-netlib/sgesvx.f @@ -0,0 +1,599 @@ +*> \brief SGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* REAL A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGESVX uses the LU factorization to compute the solution to a real +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. 
If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). 
+*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is REAL array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by SGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by SGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. 
Note that A and B are
+*> modified on exit if EQUED .ne. 'N', and the solution to the
+*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*> and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*> LDX is INTEGER
+*> The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*> RCOND is REAL
+*> The estimate of the reciprocal condition number of the matrix
+*> A after equilibration (if done).  If RCOND is less than the
+*> machine precision (in particular, if RCOND = 0), the matrix
+*> is singular to working precision.  This condition is
+*> indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*> FERR is REAL array, dimension (NRHS)
+*> The estimated forward error bound for each solution vector
+*> X(j) (the j-th column of the solution matrix X).
+*> If XTRUE is the true solution corresponding to X(j), FERR(j)
+*> is an estimated upper bound for the magnitude of the largest
+*> element in (X(j) - XTRUE) divided by the magnitude of the
+*> largest element in X(j).  The estimate is as reliable as
+*> the estimate for RCOND, and is almost always a slight
+*> overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*> BERR is REAL array, dimension (NRHS)
+*> The componentwise relative backward error of each solution
+*> vector X(j) (i.e., the smallest relative change in
+*> any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array, dimension (MAX(1,4*N))
+*> On exit, WORK(1) contains the reciprocal pivot growth
+*> factor norm(A)/norm(U). The "max absolute element" norm is
+*> used. If WORK(1) is much less than 1, then the stability
+*> of the LU factorization of the (equilibrated) matrix A
+*> could be poor. This also means that the solution X, condition
+*> estimator RCOND, and forward error bound FERR could be
+*> unreliable. If factorization fails with 0 < INFO <= N, then
+*> WORK(1) contains the reciprocal pivot growth factor for the
+*> leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*> IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*> INFO is INTEGER
+*> = 0:  successful exit
+*> < 0:  if INFO = -i, the i-th argument had an illegal value
+*> > 0:  if INFO = i, and i is
+*>       <= N:  U(i,i) is exactly zero.  The factorization has
+*>              been completed, but the factor U is exactly
+*>              singular, so the solution and error bounds
+*>              could not be computed. RCOND = 0 is returned.
+*>       = N+1: U is nonsingular, but RCOND is less than machine
+*>              precision, meaning that the matrix is singular
+*>              to working precision.  Nevertheless, the
+*>              solution and error bounds are computed because
+*>              there are a number of situations where the
+*>              computed solution can be more accurate than the
+*>              value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup realGEsolve
+*
+*  =====================================================================
+      SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ.
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + REAL A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SLAMCH, SLANGE, SLANTR + EXTERNAL LSAME, SLAMCH, SLANGE, SLANTR +* .. +* .. External Subroutines .. + EXTERNAL SGECON, SGEEQU, SGERFS, SGETRF, SGETRS, SLACPY, + $ SLAQGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL SGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL SLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. 
EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL SLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL SGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = SLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = SLANGE( NORM, N, N, A, LDA, WORK ) + RPVGRW = SLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL SGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL SGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL SGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of SGESVX +* + END diff --git a/lapack-netlib/zgbsvx.f b/lapack-netlib/zgbsvx.f new file mode 100644 index 000000000..871564a81 --- /dev/null +++ b/lapack-netlib/zgbsvx.f @@ -0,0 +1,644 @@ +*> \brief ZGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX*16 AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGBSVX uses the LU factorization to compute the solution to a complex +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. 
+*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. 
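+*> \endverbatim
+*>
+*> \par Band storage sketch:
+*> \verbatim
+*> An illustrative example of the band storage described under AB
+*> below: for N = 4, KL = 1 and KU = 1, rows 1 to KL+KU+1 = 3 of
+*> AB hold
+*>
+*>        AB(1,1:4) =   *   a12  a23  a34
+*>        AB(2,1:4) =  a11  a22  a33  a44
+*>        AB(3,1:4) =  a21  a32  a43   *
+*>
+*> where * marks elements that are not referenced.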
+*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is COMPLEX*16 array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by ZGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by ZGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. 
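+*> (When R is computed by this routine through ZGBEQU, R(i) is
+*> essentially the reciprocal of the largest absolute value in row i
+*> of A, so each row of the scaled matrix diag(R)*A has largest
+*> entry of roughly unit magnitude.)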
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is DOUBLE PRECISION array, dimension (N)
+*> The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*> is not accessed.  C is an input argument if FACT = 'F';
+*> otherwise, C is an output argument.  If FACT = 'F' and
+*> EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*> B is COMPLEX*16 array, dimension (LDB,NRHS)
+*> On entry, the right hand side matrix B.
+*> On exit,
+*> if EQUED = 'N', B is not modified;
+*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*> diag(R)*B;
+*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*> overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*> X is COMPLEX*16 array, dimension (LDX,NRHS)
+*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*> to the original system of equations.  Note that A and B are
+*> modified on exit if EQUED .ne. 'N', and the solution to the
+*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*> and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*> LDX is INTEGER
+*> The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*> RCOND is DOUBLE PRECISION
+*> The estimate of the reciprocal condition number of the matrix
+*> A after equilibration (if done).  If RCOND is less than the
+*> machine precision (in particular, if RCOND = 0), the matrix
+*> is singular to working precision.  This condition is
+*> indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*> FERR is DOUBLE PRECISION array, dimension (NRHS)
+*> The estimated forward error bound for each solution vector
+*> X(j) (the j-th column of the solution matrix X).
+*> If XTRUE is the true solution corresponding to X(j), FERR(j)
+*> is an estimated upper bound for the magnitude of the largest
+*> element in (X(j) - XTRUE) divided by the magnitude of the
+*> largest element in X(j).  The estimate is as reliable as
+*> the estimate for RCOND, and is almost always a slight
+*> overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*> BERR is DOUBLE PRECISION array, dimension (NRHS)
+*> The componentwise relative backward error of each solution
+*> vector X(j) (i.e., the smallest relative change in
+*> any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is COMPLEX*16 array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,N))
+*> On exit, RWORK(1) contains the reciprocal pivot growth
+*> factor norm(A)/norm(U). The "max absolute element" norm is
+*> used. If RWORK(1) is much less than 1, then the stability
+*> of the LU factorization of the (equilibrated) matrix A
+*> could be poor. This also means that the solution X, condition
+*> estimator RCOND, and forward error bound FERR could be
+*> unreliable. If factorization fails with 0 < INFO <= N, then
+*> RWORK(1) contains the reciprocal pivot growth factor for the
+*> leading INFO columns of A.
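+*> As a rough rule of thumb, a value of RWORK(1) of order one
+*> indicates a stable factorization, while a value close to the
+*> machine precision suggests that element growth has destroyed
+*> most of the accuracy of the computed factors.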
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GBsolve +* +* ===================================================================== + SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX*16 AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, ZLANGB, ZLANTB + EXTERNAL LSAME, DLAMCH, ZLANGB, ZLANTB +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZCOPY, ZGBCON, ZGBEQU, ZGBRFS, ZGBTRF, + $ ZGBTRS, ZLACPY, ZLAQGB +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. 
LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL ZGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL ZLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL ZCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL ZGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = ZLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = ZLANGB( NORM, N, KL, KU, AB, LDAB, RWORK ) + RPVGRW = ZLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ZLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL ZGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL ZGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. 
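+*     (In the reference implementation, ZGBRFS iterates on each right
+*     hand side while the componentwise backward error keeps
+*     decreasing, up to a small fixed number of refinement sweeps.)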
+* + CALL ZGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of ZGBSVX +* + END diff --git a/lapack-netlib/zgejsv.f b/lapack-netlib/zgejsv.f new file mode 100644 index 000000000..5fe899e50 --- /dev/null +++ b/lapack-netlib/zgejsv.f @@ -0,0 +1,2234 @@ +*> \brief \b ZGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) +* DOUBLE PRECISION SVA( N ), RWORK( LRWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGEJSV computes the singular value decomposition (SVD) of a complex M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^*, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and +*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> \endverbatim +*> +*> Arguments: +*> ========== +*> +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. 
+*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*>        D1, D2, and well-conditioned matrix C, this option gives
+*>        higher accuracy than the 'C' option. If the structure of the
+*>        input matrix is not known, and relative accuracy is
+*>        desirable, then this option is advisable. The input matrix A
+*>        is preprocessed with QR factorization with FULL (row and
+*>        column) pivoting.
+*> = 'G': Computation as with 'F' with an additional estimate of the
+*>        condition number of B, where A=B*D. If A has heavily weighted
+*>        rows, then using this condition number gives an overly
+*>        pessimistic error bound.
+*> = 'A': Small singular values are not well determined by the data
+*>        and are considered as noisy; the matrix is treated as
+*>        numerically rank deficient. The error in the computed
+*>        singular values is bounded by f(m,n)*epsilon*||A||.
+*>        The computed SVD A = U * S * V^* restores A up to
+*>        f(m,n)*epsilon*||A||.
+*>        This gives the procedure the licence to discard (set to zero)
+*>        all singular values below N*epsilon*||A||.
+*> = 'R': Similar to 'A'. The rank revealing property of the initial
+*>        QR factorization is used to reveal (using the triangular
+*>        factor) a gap sigma_{r+1} < epsilon * sigma_r, in which case
+*>        the numerical RANK is declared to be r. The SVD is computed
+*>        with absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*> JOBU is CHARACTER*1
+*> Specifies whether to compute the columns of U:
+*> = 'U': N columns of U are returned in the array U.
+*> = 'F': full set of M left sing. vectors is returned in the array U.
+*> = 'W': U may be used as workspace of length M*N. See the description
+*>        of U.
+*> = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*> JOBV is CHARACTER*1
+*> Specifies whether to compute the matrix V:
+*> = 'V': N columns of V are returned in the array V; Jacobi rotations
+*>        are not explicitly accumulated.
+*> = 'J': N columns of V are returned in the array V, but they are
+*>        computed as the product of Jacobi rotations, if JOBT = 'N'.
+*> = 'W': V may be used as workspace of length N*N. See the description
+*>        of V.
+*> = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*> JOBR is CHARACTER*1
+*> Specifies the RANGE for the singular values. Issues the licence to
+*> set to zero small positive singular values if they are outside the
+*> specified range. If A .NE. 0 is scaled so that the largest singular
+*> value of c*A is around SQRT(BIG), BIG=DLAMCH('O'), then JOBR issues
+*> the licence to kill columns of A whose norm in c*A is less than
+*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*> where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E').
+*> = 'N': Do not kill small columns of c*A. This option assumes that
+*>        BLAS and QR factorizations and triangular solvers are
+*>        implemented to work in that range. If the condition of A
+*>        is greater than BIG, use ZGESVJ.
+*> = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)]
+*>        (roughly, as described above). This option is recommended.
+*> ===========================
+*> For computing the singular values in the FULL range [SFMIN,BIG]
+*> use ZGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*> JOBT is CHARACTER*1
+*> If the matrix is square then the procedure may determine to use
+*> transposed A if A^* seems to be better with respect to convergence.
+*> If the matrix is not square, JOBT is ignored.
+*> The decision is based on two values of entropy over the adjoint
+*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
+*> = 'T': transpose if the entropy test indicates possibly faster
+*> convergence of the Jacobi process if A^* is taken as input. If A is
+*> replaced with A^*, then the row pivoting is included automatically.
+*> = 'N': do not speculate.
+*> The option 'T' can be used to compute only the singular values, or
+*> the full SVD (U, SIGMA and V). For only one set of singular vectors
+*> (U or V), the caller should provide both U and V, as one of the
+*> matrices is used as workspace if the matrix A is transposed.
+*> The implementer can easily remove this constraint and make the
+*> code more complicated. See the descriptions of U and V.
+*> In general, this option is considered experimental, and 'N' should
+*> be preferred. This is subject to changes in the future.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*> JOBP is CHARACTER*1
+*> Issues the licence to introduce structured perturbations to drown
+*> denormalized numbers. This licence should be active if the
+*> denormals are poorly implemented, causing slow computation,
+*> especially in cases of fast convergence (!). For details see [1,2].
+*> For the sake of simplicity, these perturbations are included only
+*> when the full SVD or only the singular values are requested. The
+*> implementer/user can easily add the perturbation for the cases of
+*> computing one set of singular vectors.
+*> = 'P': introduce perturbation
+*> = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension (LDA,N)
+*> On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*> SVA is DOUBLE PRECISION array, dimension (N)
+*> On exit,
+*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>   the computation SVA contains Euclidean column norms of the
+*>   iterated matrices in the array A.
+*> - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>   (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
+*>   sigma_max(A) overflows or if small singular values have been
+*>   saved from underflow by scaling the input matrix A.
+*> - If JOBR='R' then some of the singular values may be returned
+*>   as exact zeros obtained by "set to zero" because they are
+*>   below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*> U is COMPLEX*16 array, dimension ( LDU, N )
+*> If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*> the left singular vectors.
+*> If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*> the left singular vectors, including an ONB
+*> of the orthogonal complement of the Range(A).
+*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*> then U is used as workspace if the procedure
+*> replaces A with A^*. In that case, [V] is computed
+*> in U as left singular vectors of A^* and then
+*> copied back to the V array. This 'W' option is just
+*> a reminder to the caller that in this case U is
+*> reserved as workspace of length N*N.
+*> If JOBU = 'N' U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*> LDU is INTEGER
+*> The leading dimension of the array U, LDU >= 1.
+*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*> V is COMPLEX*16 array, dimension ( LDV, N )
+*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*> the right singular vectors;
+*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
+*> then V is used as workspace if the procedure
+*> replaces A with A^*. In that case, [U] is computed
+*> in V as right singular vectors of A^* and then
+*> copied back to the U array. This 'W' option is just
+*> a reminder to the caller that in this case V is
+*> reserved as workspace of length N*N.
+*> If JOBV = 'N' V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V, LDV >= 1.
+*> If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] CWORK
+*> \verbatim
+*> CWORK is COMPLEX*16 array, dimension (MAX(2,LWORK))
+*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or
+*> LRWORK=-1), then on exit CWORK(1) contains the required length of
+*> CWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*> LWORK is INTEGER
+*> Length of CWORK to confirm proper allocation of workspace.
+*> LWORK depends on the job:
+*>
+*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
+*>   1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>      LWORK >= 2*N+1. This is the minimal requirement.
+*>      ->> For optimal performance (blocked code) the optimal value
+*>      is LWORK >= N + (N+1)*NB. Here NB is the optimal
+*>      block size for ZGEQP3 and ZGEQRF.
+*>      In general, optimal LWORK is computed as
+*>      LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ)).
+*>   1.2. .. an estimate of the scaled condition number of A is
+*>      required (JOBA='E', or 'G'). In this case, the minimal
+*>      requirement is LWORK >= N*N + 2*N.
+*>      ->> For optimal performance (blocked code) the optimal value
+*>      is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N.
+*>      In general, the optimal length LWORK is computed as
+*>      LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ),
+*>      N*N+LWORK(ZPOCON)).
+*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>    (JOBU = 'N')
+*>   2.1 .. no scaled condition estimate requested (JOBE = 'N'):
+*>      -> the minimal requirement is LWORK >= 3*N.
+*>      -> For optimal performance,
+*>      LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>      where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF,
+*>      ZUNMLQ. In general, the optimal length LWORK is computed as
+*>      LWORK >= max(N+LWORK(ZGEQP3), N+LWORK(ZGESVJ),
+*>      N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)).
+*>   2.2 .. an estimate of the scaled condition number of A is
+*>      required (JOBA='E', or 'G').
+*>      -> the minimal requirement is LWORK >= 3*N.
+*>      -> For optimal performance,
+*>      LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB,
+*>      where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF,
+*>      ZUNMLQ. In general, the optimal length LWORK is computed as
+*>      LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ),
+*>      N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)).
+*> 3. If SIGMA and the left singular vectors are needed
+*>   3.1 ..
no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). +*> 3.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZPOCON), +*> 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' +*> the minimal requirement is LWORK >= 5*N+2*N*N. +*> 4.2. if JOBV = 'J' the minimal requirement is +*> LWORK >= 4*N+N*N. +*> In both cases, the allocated CWORK can accommodate blocked runs +*> of ZGEQP3, ZGEQRF, ZGELQF, SUNMQR, ZUNMLQ. +*> +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the +*> minimal length of CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(7,LRWORK)) +*> On exit, +*> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) +*> such that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> RWORK(2) = See the description of RWORK(1). +*> RWORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +*> It is computed using ZPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. +*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> RWORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> RWORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy +*> of diag(A^* * A) / Trace(A^* * A) taken as point in the +*> probability simplex. +*> RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).) +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit RWORK(1) contains the required length of +*> RWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> Length of RWORK to confirm proper allocation of workspace. +*> LRWORK depends on the job: +*> +*> 1. 
If only the singular values are requested i.e. if +*> LSAME(JOBU,'N') .AND. LSAME(JOBV,'N') +*> then: +*> 1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then: LRWORK = max( 7, 2 * M ). +*> 1.2. Otherwise, LRWORK = max( 7, N ). +*> 2. If singular values with the right singular vectors are requested +*> i.e. if +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND. +*> .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) +*> then: +*> 2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 2.2. Otherwise, LRWORK = max( 7, N ). +*> 3. If singular values with the left singular vectors are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 3.2. Otherwise, LRWORK = max( 7, N ). +*> 4. If singular values with both the left and the right singular vectors +*> are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 4.2. Otherwise, LRWORK = max( 7, N ). +*> +*> If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and +*> the length of RWORK is returned in RWORK(1). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, of dimension at least 4, that further depends +*> on the job: +*> +*> 1. If only the singular values are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 2. If the singular values and the right singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 3. If the singular values and the left singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4. If the singular values with both the left and the right singular vectors +*> are requested, then: +*> 4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N. +*> +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to +*> do the job as specified by the JOB parameters. +*> If the call to ZGEJSV is a workspace query (indicated by LWORK = -1 or +*> LRWORK = -1), then on exit IWORK(1) contains the required length of +*> IWORK for the job parameters used in the call. 
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: ZGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> ZGEJSV implements a preconditioned Jacobi SVD algorithm. It uses ZGEQP3, +*> ZGEQRF, and ZGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by ZGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (ZGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (ZGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: ZGEQP3) should be +*> implemented as in [3]. We have a new version of ZGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in ZGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of ZGEJSV uses only the simplest, naive data movement. 
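+*>
+*> A minimal calling sketch (illustrative only; the workspace lengths
+*> are obtained with the workspace query documented above, and all
+*> array arguments are assumed to be declared by the caller):
+*>
+*>    CALL ZGEJSV( 'C', 'U', 'V', 'R', 'N', 'N', M, N, A, LDA, SVA,
+*>   $             U, LDU, V, LDV, CWORK, -1, RWORK, -1, IWORK, INFO )
+*>    LWORK  = INT( CWORK( 1 ) )
+*>    LRWORK = INT( RWORK( 1 ) )
+*>
+*> then allocate CWORK(LWORK), RWORK(LRWORK) and IWORK as returned,
+*> and call ZGEJSV again with the same JOB parameters to obtain the
+*> singular values in SVA and the singular vectors in U and V.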
+*> \endverbatim +* +*> \par Contributor: +* ================== +*> +*> Zlatko Drmac, Department of Mathematics, Faculty of Science, +*> University of Zagreb (Zagreb, Croatia); drmac@math.hr +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008, 2016. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), + $ CWORK( LWORK ) + DOUBLE PRECISION SVA( N ), RWORK( LRWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + COMPLEX*16 CTEMP + DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, + $ COND_OK, CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, + $ MAXPRJ, SCALEM, SCONDA, SFMIN, SMALL, TEMP1, + $ USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LQUERY, + $ LSVEC, L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, NOSCAL, + $ ROWPIV, RSVEC, TRANSP +* + INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK + INTEGER LWCON, LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM, + $ LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF + INTEGER LWRK_ZGELQF, LWRK_ZGEQP3, LWRK_ZGEQP3N, LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZGESVJV, LWRK_ZGESVJU, LWRK_ZUNMLQ, + $ LWRK_ZUNMQR, LWRK_ZUNMQRM +* .. +* .. Local Arrays + COMPLEX*16 CDUMMY(1) + DOUBLE PRECISION RDUMMY(1) +* +* .. Intrinsic Functions .. + INTRINSIC ABS, DCMPLX, CONJG, DLOG, MAX, MIN, DBLE, NINT, SQRT +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DZNRM2 + INTEGER IDAMAX, IZAMAX + LOGICAL LSAME + EXTERNAL IDAMAX, IZAMAX, LSAME, DLAMCH, DZNRM2 +* .. +* .. External Subroutines .. + EXTERNAL DLASSQ, ZCOPY, ZGELQF, ZGEQP3, ZGEQRF, ZLACPY, ZLAPMR, + $ ZLASCL, DLASCL, ZLASET, ZLASSQ, ZLASWP, ZUNGQR, ZUNMLQ, + $ ZUNMQR, ZPOCON, DSCAL, ZDSCAL, ZSWAP, ZTRSM, ZLACGV, + $ XERBLA +* + EXTERNAL ZGESVJ +* .. 
+* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK, LRWORK are written with a lot of redundancy and +* can be simplified. However, this verbose form is useful for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for ZGEQP3 of an M x N matrix, +* ZGEQRF of an N x N matrix, ZGELQF of an N x N matrix, +* ZUNMLQ for computing N x N matrix, ZUNMQR for computing N x N +* matrix, ZUNMQR for computing M x N matrix, respectively. + LWQP3 = N+1 + LWQRF = MAX( 1, N ) + LWLQF = MAX( 1, N ) + LWUNMLQ = MAX( 1, N ) + LWUNMQR = MAX( 1, N ) + LWUNMQRM = MAX( 1, M ) +* .. minimal workspace length for ZPOCON of an N x N matrix + LWCON = 2 * N +* .. minimal workspace length for ZGESVJ of an N x N matrix, +* without and with explicit accumulation of Jacobi rotations + LWSVDJ = MAX( 2 * N, 1 ) + LWSVDJV = MAX( 2 * N, 1 ) +* .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ + LRWQP3 = 2 * N + LRWCON = N + LRWSVDJ = N + IF ( LQUERY ) THEN + CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_ZGEQP3 = INT( CDUMMY(1) ) + CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_ZGEQRF = INT( CDUMMY(1) ) + CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_ZGELQF = INT( CDUMMY(1) ) + END IF + MINWRK = 2 + OPTWRK = 2 + MINIWRK = N + IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN +* .. minimal and optimal sizes of the complex workspace if +* only the singular values are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ ) + ELSE + MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, + $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, + $ N+LWRK_ZGEQRF, LWRK_ZGESVJ ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ ) + END IF + END IF + IF ( L2TRAN .OR. 
ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the right singular vectors are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF, + $ 2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, + $ N+LWSVDJ, N+LWUNMLQ ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, + $ N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF, + $ N+LWRK_ZGESVJ, N+LWRK_ZUNMLQ ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, LWRK_ZGESVJ,N+LWRK_ZGELQF, + $ 2*N+LWRK_ZGEQRF, N+LWRK_ZGESVJ, + $ N+LWRK_ZUNMLQ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the left singular vectors are requested + IF ( ERREST ) THEN + MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM ) + ELSE + MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) + ELSE + OPTWRK = N + MAX( LWRK_ZGEQP3, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE +* .. minimal and optimal sizes of the complex workspace if the +* full SVD is requested + IF ( .NOT. JRACC ) THEN + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + END IF + MINIWRK = MINIWRK + N + IF ( ROWPIV .OR. 
L2TRAN ) MINIWRK = MINIWRK + M + ELSE + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + END IF + IF ( LQUERY ) THEN + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) + IF ( .NOT. JRACC ) THEN + CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_ZGEQP3N = INT( CDUMMY(1) ) + CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJU = INT( CDUMMY(1) ) + CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) + CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, + $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, + $ 2*N+LWRK_ZGEQP3N, + $ 2*N+N**2+N+LWRK_ZGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_ZGESVJ, + $ 2*N+N**2+N+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ 2*N+N**2+N+LWRK_ZUNMLQ, + $ N+N**2+LWRK_ZGESVJU, + $ N+LWRK_ZUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, + $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, + $ 2*N+LWRK_ZGEQP3N, + $ 2*N+N**2+N+LWRK_ZGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_ZGESVJ, + $ 2*N+N**2+N+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ 2*N+N**2+N+LWRK_ZUNMLQ, + $ N+N**2+LWRK_ZGESVJU, + $ N+LWRK_ZUNMQRM ) + END IF + ELSE + CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, + $ 2*N+LWRK_ZGEQRF, 2*N+N**2, + $ 2*N+N**2+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR,N+LWRK_ZUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, 2*N+LWRK_ZGEQRF, + $ 2*N+N**2, 2*N+N**2+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ N+LWRK_ZUNMQRM ) + END IF + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + END IF + END IF + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( MINWRK, OPTWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 + IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'ZGEJSV', - INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + CWORK(1) = OPTWRK + CWORK(2) = MINWRK + RWORK(1) = MINRWRK + IWORK(1) = MAX( 4, MINIWRK ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:4) = 0 + RWORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure DLAMCH() does not fail on the target architecture. 
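+*     (For reference, in IEEE double precision EPSLN ~ 1.1D-16,
+*     SFMIN ~ 2.2D-308 and BIG ~ 1.8D+308, so SMALL = SFMIN / EPSLN
+*     ~ 2.0D-292 is the safe-minimum threshold enlarged by the
+*     reciprocal of the machine precision.)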
+* + EPSLN = DLAMCH('Epsilon') + SFMIN = DLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = DLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / SQRT(DBLE(M)*DBLE(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL ZLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'ZGEJSV', -INFO ) + RETURN + END IF + AAQQ = SQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL DSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL ZLASET( 'G', M, N1, CZERO, CONE, U, LDU ) + IF ( RSVEC ) CALL ZLASET( 'G', N, N, CZERO, CONE, V, LDV ) + RWORK(1) = ONE + RWORK(2) = ONE + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + IWORK(4) = -1 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill nonzero columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL ZLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL ZLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL ZGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR ) + CALL ZUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR ) + CALL ZCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = CONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + RWORK(1) = ONE / SCALEM + RWORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IWORK(4) = -1 + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.). 
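+*     (ZLASSQ returns the sum of squares in the scaled form
+*     SCL**2 * SSQ, with SCL the largest absolute entry processed;
+*     the 2-norms here are therefore formed as SCL*SQRT(SSQ) without
+*     intermediate overflow, and, since SCL enters as zero, SCL itself
+*     is the ell_infinity norm of the vector.)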
+* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL ZLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* ZLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + RWORK(M+p) = XSC * SCALEM + RWORK(p) = XSC * (SCALEM*SQRT(TEMP1)) + AATMAX = MAX( AATMAX, RWORK(p) ) + IF (RWORK(p) .NE. ZERO) + $ AATMIN = MIN(AATMIN,RWORK(p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + RWORK(M+p) = SCALEM*ABS( A(p,IZAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, RWORK(M+p) ) + AATMIN = MIN( AATMIN, RWORK(M+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^* would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / DLOG(DBLE(N)) +* +* Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex. +* It is derived from the diagonal of A^* * A. Do the same with the +* diagonal of A * A^*, compute the entropy of the corresponding +* probability distribution. Note that A * A^* and A^* * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = 1, M + BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / DLOG(DBLE(M)) +* +* Analyze the entropies and decide A or A^*. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^* is better than A, take the adjoint of A. This is allowed +* only for square matrices, M=N. + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + A(p,p) = CONJG(A(p,p)) + DO 1116 q = p + 1, N + CTEMP = CONJG(A(q,p)) + A(q,p) = CONJG(A(p,q)) + A(p,q) = CTEMP + 1116 CONTINUE + 1115 CONTINUE + A(N,N) = CONJG(A(N,N)) + DO 1117 p = 1, N + RWORK(M+p) = SVA(p) + SVA(p) = RWORK(p) +* previously computed row 2-norms are now column 2-norms +* of the transposed matrix + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than SQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to SQRT(BIG/N). The only reason to keep +* SQRT(BIG) instead of BIG is the fact that ZGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then ZGESVJ will compute them. So, in that case, +* one should use ZGESVJ instead of ZGEJSV. +* >> change in the April 2016 update: allow bigger range, i.e. the +* largest column is allowed up to BIG/N and ZGESVJ will do the rest. 
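+*     (Concretely: SVA is scaled by TEMP1/AAPP and A by
+*     SCALEM*TEMP1/AAPP with TEMP1 = SQRT(BIG/N), so the largest
+*     column of the scaled A has Euclidean norm SQRT(BIG/N); USCAL1
+*     and USCAL2 record the factors used to restore the singular
+*     values on exit.)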
+ BIG1 = SQRT( BIG ) + TEMP1 = SQRT( BIG / DBLE(N) ) +* TEMP1 = BIG/DBLE(N) +* + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL ZLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN). + XSC = SQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using ZGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL ZLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN + IWOFF = 2*N + ELSE + IWOFF = N + END IF + DO 1952 p = 1, M - 1 + q = IDAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1 + IWORK(IWOFF+p) = q + IF ( p .NE. q ) THEN + TEMP1 = RWORK(M+p) + RWORK(M+p) = RWORK(M+q) + RWORK(M+q) = TEMP1 + END IF + 1952 CONTINUE + CALL ZLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use ZGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in xGEQPX from TOMS # 782). Good results will be obtained using +* xGEQPX with properly (!) chosen numerical parameters. +* Any improvement of ZGEQP3 improves overall performance of ZGEJSV. +* +* A * P1 = Q1 * [ R1^* 0]^*: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL ZGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N, + $ RWORK, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. 
If +* L2RANK or L2ABER are up, then ZGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = SQRT(DBLE(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = SQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = SQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = ABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL ZLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL ZDSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + IF ( LSVEC )THEN + CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK, RWORK, IERR ) + END IF +* + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL ZLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL ZDSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL ZPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL ZLACPY( 'U', N, N, A, LDA, CWORK, N ) +*[] CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) +* Change: here index shifted by N to the left, CWORK(1:N) +* not needed for SIGMA only computation + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) +*[] CALL ZDSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 ) + CALL ZDSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. +*[] CALL ZPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1, +*[] $ CWORK(N+N*N+1), RWORK, IERR ) + CALL ZPOCON( 'U', N, CWORK, N, ONE, TEMP1, + $ CWORK(N*N+1), RWORK, IERR ) +* + END IF + IF ( TEMP1 .NE. ZERO ) THEN + SCONDA = ONE / SQRT(TEMP1) + ELSE + SCONDA = - ONE + END IF +* SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. 
SQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL ZCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL ZLACGV( N-p+1, A(p,p), 1 ) + 1946 CONTINUE + IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N)) +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 4947 q = 1, NR + CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL ZGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL ZCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL ZLACGV( NR-p+1, A(p,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 1947 q = 1, NR + CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL ZGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA, + $ N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* +* + ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) + $ .OR. + $ ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 1998 CONTINUE + CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) +* + CALL ZGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + ELSE +* +* .. 
two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL ZLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA ) + CALL ZGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR) + CALL ZLACPY( 'L', NR, NR, A, LDA, V, LDV ) + CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) + CALL ZGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL ZCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + CALL ZLACGV( NR-p+1, V(p,p), 1 ) + 8998 CONTINUE + CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV) +* + CALL ZGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1), LDV ) + CALL ZLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1), LDV ) + CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV ) + END IF +* + CALL ZUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK, + $ V, LDV, CWORK(N+1), LWORK-N, IERR ) +* + END IF +* .. permute the rows of V +* DO 8991 p = 1, N +* CALL ZCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) +* 8991 CONTINUE +* CALL ZLACPY( 'All', N, N, A, LDA, V, LDV ) + CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + IF ( TRANSP ) THEN + CALL ZLACPY( 'A', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN +* + CALL ZLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA ) +* + CALL ZGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + CALL ZLACGV( N-p+1, U(p,p), 1 ) + 1965 CONTINUE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL ZGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL ZCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + CALL ZLACGV( N-p+1, U(p,p), 1 ) + 1967 CONTINUE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL ZGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* + IF ( NR .LT. M ) THEN + CALL ZLASET( 'A', M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL ZLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU ) + CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL ZLACPY( 'A', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of ZGEJSV. 
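+*     (Schematically: if A**H = Q*R is a QR factorization, then
+*     A = R**H * Q**H = L*Q1 with L = R**H lower triangular, which is
+*     an LQ factorization of A; hence TRANSPOSE o QRF o TRANSPOSE
+*     produces the same triangular factor as an LQF.)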
+* + DO 1968 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 2969 q = 1, NR + CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL ZLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = DZNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1) + CALL ZDSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL ZPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, + $ CWORK(2*N+NR*NR+1),RWORK,IERR) + CONDR1 = ONE / SQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) +* more conservative <=> CONDR1 .LT. SQRT(DBLE(N)) +* + COND_OK = SQRT(SQRT(DBLE(NR))) +*[TP] COND_OK is a tuning parameter. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^* = Q2 * R2 + CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL ZCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + CALL ZLACGV(NR-p+1, V(p,p), 1 ) + 1969 CONTINUE + V(NR,NR)=CONJG(V(NR,NR)) +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to ZGEQP3 +* should be replaced with eg. CALL ZGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. 
+*
+*           R1^* * P2 = Q2 * R2
+            DO 3003 p = 1, NR
+               IWORK(N+p) = 0
+ 3003       CONTINUE
+            CALL ZGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1),
+     $                   CWORK(2*N+1), LWORK-2*N, RWORK, IERR )
+**          CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+**   $                   LWORK-2*N, IERR )
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 3969 p = 2, NR
+                  DO 3968 q = 1, p - 1
+                     CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                            ZERO)
+                     IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                  V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                  V(q,p) = CTEMP
+ 3968             CONTINUE
+ 3969          CONTINUE
+            END IF
+*
+            CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 8970 p = 2, NR
+                  DO 8971 q = 1, p - 1
+                     CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                            ZERO)
+*                    V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) )
+                     V(p,q) = - CTEMP
+ 8971             CONTINUE
+ 8970          CONTINUE
+            ELSE
+               CALL ZLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV )
+            END IF
+*           Now, compute R2 = L3 * Q3, the LQ factorization.
+            CALL ZGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1),
+     $                   CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*           .. and estimate the condition number
+            CALL ZLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR )
+            DO 4950 p = 1, NR
+               TEMP1 = DZNRM2( p, CWORK(2*N+N*NR+NR+p), NR )
+               CALL ZDSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR )
+ 4950       CONTINUE
+            CALL ZPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $                   CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR )
+            CONDR2 = ONE / SQRT(TEMP1)
+*
+*
+            IF ( CONDR2 .GE. COND_OK ) THEN
+*              .. save the Householder vectors used for Q3
+*              (this overwrites the copy of R2, as it will not be
+*              needed in this branch, but it does not overwrite the
+*              Householder vectors of Q2).
+               CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
+*              .. and the rest of the information on Q3 is in
+*              WORK(2*N+N*NR+1:2*N+N*NR+N)
+            END IF
+*
+         END IF
+*
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL)
+            DO 4968 q = 2, NR
+               CTEMP = XSC * V(q,q)
+               DO 4969 p = 1, q - 1
+*                 V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) )
+                  V(p,q) = - CTEMP
+ 4969          CONTINUE
+ 4968       CONTINUE
+         ELSE
+            CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+         END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower triangular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+         IF ( CONDR1 .LT. COND_OK ) THEN
+*
+            CALL ZGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU,
+     $           CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK,
+     $           LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 3970 p = 1, NR
+               CALL ZCOPY(  NR, V(1,p), 1, U(1,p), 1 )
+               CALL ZDSCAL( NR, SVA(p), V(1,p), 1 )
+ 3970       CONTINUE
+
+*           .. pick the right matrix equation and solve it
+*
+            IF ( NR .EQ. N ) THEN
+* :))          .. best case, R1 is inverted. The solution of this matrix
+*              equation is Q2*V2 = the product of the Jacobi rotations
+*              used in ZGESVJ, premultiplied with the unitary matrix
+*              from the second QR factorization.
+               CALL ZTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV)
+            ELSE
+*              .. R1 is well conditioned, but non-square. Adjoint of R2
+*              is inverted to get the product of the Jacobi rotations
+*              used in ZGESVJ. The Q-factor from the second QR
+*              factorization is then built in explicitly.
+               CALL ZTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1),
+     $                    N,V,LDV)
+               IF ( NR .LT. N ) THEN
+                  CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL ZUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $             V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+            END IF
+*
+         ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+*           The matrix R2 is inverted. The solution of the matrix equation
+*           is Q3^* * V3 = the product of the Jacobi rotations (applied to
+*           the lower triangular L3 from the LQ factorization of
+*           R2=L3*Q3), pre-multiplied with the conjugate-transposed Q3.
+            CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $           LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $           RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 3870 p = 1, NR
+               CALL ZCOPY( NR, V(1,p), 1, U(1,p), 1 )
+               CALL ZDSCAL( NR, SVA(p), U(1,p), 1 )
+ 3870       CONTINUE
+            CALL ZTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N,
+     $                 U,LDU)
+*           .. apply the permutation from the second QR factorization
+            DO 873 q = 1, NR
+               DO 872 p = 1, NR
+                  CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872           CONTINUE
+               DO 874 p = 1, NR
+                  U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 874           CONTINUE
+ 873        CONTINUE
+            IF ( NR .LT. N ) THEN
+               CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+               CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+               CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+            END IF
+            CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $           V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+         ELSE
+*           Last line of defense.
+* #:(       This is a rather pathological case: no scaled condition
+*           improvement after two pivoted QR factorizations. Another
+*           possibility is that the rank revealing QR factorization
+*           or the condition estimator has failed, or the COND_OK
+*           is set very close to ONE (which is unnecessary). Normally,
+*           this branch should never be executed, but in rare cases of
+*           failure of the RRQR or condition estimator, the last line of
+*           defense ensures that ZGEJSV completes the task.
+*           Compute the full SVD of L3 using ZGESVJ with explicit
+*           accumulation of Jacobi rotations.
+            CALL ZGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $           LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $           RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            IF ( NR .LT. N ) THEN
+               CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+               CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+               CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+            END IF
+            CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $           V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+            CALL ZUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N,
+     $           CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1),
+     $           LWORK-2*N-N*NR-NR, IERR )
+            DO 773 q = 1, NR
+               DO 772 p = 1, NR
+                  CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772           CONTINUE
+               DO 774 p = 1, NR
+                  U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 774           CONTINUE
+ 773        CONTINUE
+*
+         END IF
+*
+*        Permute the rows of V using the (column) permutation from the
+*        first QRF. Also, scale the columns to make them unit in
+*        Euclidean norm. This applies to all cases.
+*
+         TEMP1 = SQRT(DBLE(N)) * EPSLN
+         DO 1972 q = 1, N
+            DO 972 p = 1, N
+               CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 972        CONTINUE
+            DO 973 p = 1, N
+               V(p,q) = CWORK(2*N+N*NR+NR+p)
+ 973        CONTINUE
+            XSC = ONE / DZNRM2( N, V(1,q), 1 )
+            IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $        CALL ZDSCAL( N, XSC, V(1,q), 1 )
+ 1972    CONTINUE
+*        At this moment, V contains the right singular vectors of A.
+* Next, assemble the left singular vector matrix U (M x N). + IF ( NR .LT. M ) THEN + CALL ZLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL ZLASET('A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = SQRT(DBLE(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 5970 p = 2, N + CTEMP = XSC * CWORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 +* CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) / +* $ ABS(CWORK(N+(p-1)*N+q)) ) + CWORK(N+(q-1)*N+p)=-CTEMP + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL ZLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N ) + END IF +* + CALL ZGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA, + $ N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK, + $ INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 6970 p = 1, N + CALL ZCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL ZDSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL ZTRSM( 'L', 'U', 'N', 'N', N, N, + $ CONE, A, LDA, CWORK(N+1), N ) + DO 6972 p = 1, N + CALL ZCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = SQRT(DBLE(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / DZNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL ZLASET( 'A', M-N, N, CZERO, CZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL ZLASET('A',N, N1-N, CZERO, CZERO, U(1,N+1),LDU) + CALL ZLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU) + END IF + END IF + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + TEMP1 = SQRT(DBLE(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values, e.g. when the singular values spread from +* the underflow to the overflow threshold. 
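+*     (This branch is entered either because JOBV = 'J' was requested,
+*     or because JRACC was switched on above when AAQQ dropped below
+*     SQRT(SFMIN), with L2KILL not requested, while both sets of
+*     singular vectors are wanted.)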
+* + DO 7968 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF + + CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL ZLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL ZCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + CALL ZLACGV( NR-p+1, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + CTEMP = DCMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))), + $ ZERO) +* U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) ) + U(p,q) = - CTEMP + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) + END IF + + CALL ZGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV ) + END IF + + CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(DBLE(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / DZNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL ZLASET( 'A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU) + CALL ZLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU) + END IF + END IF +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^* + DO 6974 p = 1, N + CALL ZSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + RWORK(1) = USCAL2 * SCALEM + RWORK(2) = USCAL1 + IF ( ERREST ) RWORK(3) = SCONDA + IF ( LSVEC .AND. 
RSVEC ) THEN + RWORK(4) = CONDR1 + RWORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ENTRA + RWORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING + IF ( TRANSP ) THEN + IWORK(4) = 1 + ELSE + IWORK(4) = -1 + END IF + +* + RETURN +* .. +* .. END OF ZGEJSV +* .. + END +* diff --git a/lapack-netlib/zgesvx.f b/lapack-netlib/zgesvx.f new file mode 100644 index 000000000..3b193a1b2 --- /dev/null +++ b/lapack-netlib/zgesvx.f @@ -0,0 +1,602 @@ +*> \brief ZGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX*16 A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGESVX uses the LU factorization to compute the solution to a complex +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. 
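+*>
+*> As an illustrative sketch (with caller-declared arrays assumed), a
+*> call that equilibrates, factors and solves in one step is:
+*>
+*>    CALL ZGESVX( 'E', 'N', N, NRHS, A, LDA, AF, LDAF, IPIV, EQUED,
+*>                 R, C, B, LDB, X, LDX, RCOND, FERR, BERR, WORK,
+*>                 RWORK, INFO )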
+*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is COMPLEX*16 array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by ZGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by ZGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. 
+*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). 
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,2*N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0 < INFO <= N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16GEsolve
+*
+*  =====================================================================
+      SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX*16         A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, ZLANGE, ZLANTR
+      EXTERNAL           LSAME, DLAMCH, ZLANGE, ZLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZGECON, ZGEEQU, ZGERFS, ZGETRF, ZGETRS,
+     $                   ZLACPY, ZLAQGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL ZGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL ZLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL ZLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL ZGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = ZLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $                       RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ZLANGE( 'M', N, INFO, A, LDA, RWORK ) /
+     $                  RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = ZLANGE( NORM, N, N, A, LDA, RWORK )
+      RPVGRW = ZLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = ZLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL ZGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL ZGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
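+*     (ZGERFS forms the residual B - op(A)*X, solves for a correction
+*     using the LU factors already stored in AF and IPIV, updates X,
+*     and fills FERR and BERR with the error estimates described above.)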
+*
+      CALL ZGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of ZGESVX
+*
+      END
diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c
index 80c66dd7a..5795797d3 100644
--- a/lapack/getf2/getf2_k.c
+++ b/lapack/getf2/getf2_k.c
@@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb);
 
     jp = j + IAMAX_K(m - j, b + j, 1);
-    if (jp>m) jp = m; //avoid out of boundary
+    if (jp>m) jp = m; //avoid an out-of-bounds index when the iamax kernel does not cope with NaN in the input, see gh issue 723
     ipiv[j + offset] = jp + offset;
     jp--;
     temp1 = *(b + jp);
diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c
index e3d53c96f..6a2137b3e 100644
--- a/lapack/getf2/zgetf2_k.c
+++ b/lapack/getf2/zgetf2_k.c
@@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb);
 
     jp = j + IAMAX_K(m - j, b + j * 2, 1);
-    if (jp>m) jp = m; //avoid out of boundary
+    if (jp>m) jp = m; //avoid an out-of-bounds index when the iamax kernel does not cope with NaN in the input, see gh issue 723
     ipiv[j + offset] = jp + offset;
     jp--;
diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c
index 29364cc05..a7c28f4c2 100644
--- a/lapack/potrf/potrf_parallel.c
+++ b/lapack/potrf/potrf_parallel.c
@@ -80,10 +80,6 @@ static FLOAT dm1 = -1.;
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 #ifndef LOWER
 #define TRANS
 #endif
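The comment added in the two getf2 hunks above references gh issue 723: an IAMAX kernel that mishandles NaN can report a pivot index past the end of the column, which the new clamp guards against. Below is a minimal scalar illustration (standalone C99, not part of the patch; naive_iamax is a hypothetical helper standing in for IAMAX_K) of why NaN defeats magnitude comparisons:

#include <math.h>
#include <stdio.h>

/* naive_iamax: 1-based index of the entry with the largest absolute
 * value -- the scalar reference behaviour of an IAMAX kernel */
static int naive_iamax(int n, const double *x)
{
    int best = 1;
    double maxval = fabs(x[0]);
    for (int i = 1; i < n; i++) {
        if (fabs(x[i]) > maxval) { /* always false when either side is NaN */
            maxval = fabs(x[i]);
            best = i + 1;
        }
    }
    return best;
}

int main(void)
{
    double col[4] = { NAN, 2.0, -7.0, 3.0 };
    /* Prints 1: every comparison against NaN fails, so this scalar loop
     * never advances past the NaN.  A vectorized kernel built on the same
     * comparison can instead return an index beyond the column, which is
     * exactly what the added clamp (if (jp>m) jp = m;) protects against. */
    printf("pivot index: %d\n", naive_iamax(4, col));
    return 0;
}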
diff --git a/param.h b/param.h
index f1f5cbdad..03bf3624f 100644
--- a/param.h
+++ b/param.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2014, The OpenBLAS Project
+Copyright (c) 2011-2023, The OpenBLAS Project
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -2845,31 +2845,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x0ffffUL
 
+#if defined(NO_LASX)
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_M 2
 #define SGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 2
+#else
 #define DGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 16
+#endif
+
 #define QGEMM_DEFAULT_UNROLL_N 2
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1
 
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_M 16
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 1
 #define ZGEMM_DEFAULT_UNROLL_M 1
 #define XGEMM_DEFAULT_UNROLL_M 1
 
-#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_P 256
 #define DGEMM_DEFAULT_P 32
 #define CGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_R 12288
+#define SGEMM_DEFAULT_R 1024
 #define DGEMM_DEFAULT_R 858
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#define SGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 256
 #define DGEMM_DEFAULT_Q 152
 #define CGEMM_DEFAULT_Q 128
 #define ZGEMM_DEFAULT_Q 128
@@ -3338,6 +3346,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(NEOVERSEN1)
 
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO 8
+#else
+#define SWITCH_RATIO 16
+#endif
+
 #define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
 
@@ -3365,21 +3379,27 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#elif defined(NEOVERSEV1)
+#elif defined(NEOVERSEV1) // 256-bit SVE
 
-#define SWITCH_RATIO 16
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO 8
+#else
+#define SWITCH_RATIO 16
+#endif
 
 #define SGEMM_DEFAULT_UNROLL_M  16
-#define SGEMM_DEFAULT_UNROLL_N  4
+#define SGEMM_DEFAULT_UNROLL_N  8
 
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
+#define DGEMM_DEFAULT_UNROLL_M  4 // Actually 2VL (8) but kept separate to keep copies separate
+#define DGEMM_DEFAULT_UNROLL_N  8
 
-#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_M  2
 #define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN 16
 
-#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_M  2
 #define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
 
 #define SGEMM_DEFAULT_P 128
 #define DGEMM_DEFAULT_P 160
@@ -3398,6 +3418,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(NEOVERSEN2)
 
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO 8
+#else
+#define SWITCH_RATIO 16
+#endif
+
 #undef SBGEMM_ALIGN_K
 #define SBGEMM_ALIGN_K 4
 
@@ -3433,7 +3459,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
+#elif defined(A64FX) // 512-bit SVE
 
 /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
 Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated.
 */
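The SWITCH_RATIO definitions added in the surrounding param.h hunks all follow the same ifndef-fallback scheme: each architecture section may pick its own per-precision value, and a single global default of 2 (added near the end of param.h, see the hunk below) replaces the local fallback this patch removes from potrf_parallel.c. A minimal, self-contained sketch of that preprocessor pattern, condensed from the hunks in this patch (NEOVERSEN1 and DOUBLE stand in for the real build defines):

#include <stdio.h>

/* a per-architecture section picks its own value ... */
#if defined(NEOVERSEN1)
#if defined(XDOUBLE) || defined(DOUBLE)
#define SWITCH_RATIO 8
#else
#define SWITCH_RATIO 16
#endif
#endif

/* ... and one global fallback supplies the old default of 2 for every
 * architecture that did not define SWITCH_RATIO itself */
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif

int main(void)
{
    /* prints 2; compile with -DNEOVERSEN1 (optionally -DDOUBLE) to see
     * the per-architecture values 16 or 8 instead */
    printf("SWITCH_RATIO = %d\n", SWITCH_RATIO);
    return 0;
}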
@@ -3474,6 +3500,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
+#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE
+
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO 8
+#else
+#define SWITCH_RATIO 16
+#endif
+
+#define SGEMM_DEFAULT_UNROLL_M  4 // Actually 1VL (8) but kept separate to keep copies separate
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  4
+#define DGEMM_DEFAULT_UNROLL_N  8
+
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN 16
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
 #else /* Other/undetected ARMv8 cores */
 
 #define SGEMM_DEFAULT_UNROLL_M  16
@@ -3838,6 +3901,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 
 #endif
 
+#ifndef SWITCH_RATIO
+#define SWITCH_RATIO 2
+#endif
+
 #ifndef QGEMM_DEFAULT_UNROLL_M
 #define QGEMM_DEFAULT_UNROLL_M 2
 #endif
diff --git a/test/Makefile b/test/Makefile
index 923f1537c..46a7b1158 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -265,7 +265,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
-CEXTRALIB = -lomp
+CEXTRALIB += -lomp
 endif
 endif
 ifeq ($(F_COMPILER), NAG)
diff --git a/utest/test_axpy.c b/utest/test_axpy.c
index 5fd7c1b04..26005e70f 100644
--- a/utest/test_axpy.c
+++ b/utest/test_axpy.c
@@ -74,6 +74,26 @@ CTEST(axpy,zaxpy_inc_0)
 		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
 	}
 }
+
+CTEST(axpy,zaxpy_incx_0)
+{
+	blasint i;
+	blasint N=4,incX=0,incY=1;
+	double a[2]={0.25,0.5};
+	double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0};
+	double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25};
+
+	//OpenBLAS
+	BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY);
+
+	for(i=0; i<2*N; i++){
+		ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+	}
+}
+
 #endif
 
 #ifdef BUILD_SINGLE
@@ -116,5 +136,24 @@ CTEST(axpy,caxpy_inc_0)
 		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
 	}
 }
+
+CTEST(axpy,caxpy_incx_0)
+{
+	blasint i;
+	blasint N=4,incX=0,incY=1;
+	float a[2]={0.25,0.5};
+	float x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	float y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0};
+	float x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	float y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25};
+
+	//OpenBLAS
+	BLASFUNC(caxpy)(&N,a,x1,&incX,y1,&incY);
+
+	for(i=0; i<2*N; i++){
+		ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+	}
+}
 #endif
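The two tests added above exercise the incX = 0 corner case: the AXPY kernel must keep reusing x[0] for every element of y, i.e. y_i := a*x_0 + y_i. The y2 reference values hard-coded in the tests can be re-derived with the short standalone C99 check below (not part of the patch; the loop bounds and complex constants simply mirror the zaxpy_incx_0 test data):

#include <complex.h>
#include <stdio.h>

int main(void)
{
    /* alpha and x[0] from zaxpy_incx_0; incX = 0 pins the kernel to x[0] */
    double complex a  = 0.25 + 0.5 * I;
    double complex x0 = 1.0 + 3.0 * I;
    double complex y[4] = { 2.0 + 4.0 * I, 6.0 + 8.0 * I,
                            2.0 + 4.0 * I, 6.0 + 8.0 * I };

    for (int i = 0; i < 4; i++) {
        y[i] += a * x0; /* y_i := a*x_0 + y_i */
        printf("%g %g\n", creal(y[i]), cimag(y[i]));
    }
    /* prints 0.75 5.25 / 4.75 9.25 / 0.75 5.25 / 4.75 9.25 -- the y2
     * reference values hard-coded in both new tests */
    return 0;
}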